source/tools/toolutil/xmlparser.cpp - external/github.com/unicode-org/icu - Git at Google

 /*
 *******************************************************************************
 *
 *   Copyright (C) 2004-2006, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
 *   file name:  xmlparser.cpp
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2004jul21
 *   created by: Andy Heninger
 */

 #include <stdio.h>
 #include "unicode/uchar.h"
 #include "unicode/ucnv.h"
 #include "unicode/regex.h"
 #include "filestrm.h"
 #include "xmlparser.h"

 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION

 // Character constants, written as numeric code point values.
 enum {
     x_QUOT=0x22,    // U+0022  quotation mark  "
     x_AMP=0x26,     // U+0026  ampersand       &
     x_APOS=0x27,    // U+0027  apostrophe      '
     x_LT=0x3c,      // U+003C  less-than sign  <
     x_GT=0x3e,      // U+003E  greater-than    >
     x_l=0x6c        // U+006C  small letter l; parseFile() looks for it to step past "<?xml"
 };

 // Regular expression fragments.  They are string literals so that they can be
 // concatenated into the larger patterns built in the UXMLParser constructor.

 // XML white space (production [3]): a set matching one space, tab, CR or LF.
 #define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

 // XML production [4], NameStartChar: the characters that may begin a name.
 #define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                     "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                     "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                     "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

 // XML production [5], NameChar: every NameStartChar plus '-', '.', the digits,
 // U+00B7 and the ranges U+0300..U+036F and U+203F..U+2040.
 #define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

 // XML production [6], Name: one NameStartChar followed by any number of NameChars.
 #define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"

 U_NAMESPACE_BEGIN

 // ICU class-ID boilerplate for the two classes implemented in this file.
 // NOTE(review): presumably this is what provides the getStaticClassID() /
 // getDynamicClassID() pair that UXMLElement uses below to tell element nodes
 // from text nodes - confirm against the macro definition.
 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)

 //
 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
 //                             used for parsing.
 //
 //      status   receives any error from constructing the matchers and the
 //               name table / element stack members.
 //
 UXMLParser::UXMLParser(UErrorCode &status) :
       //  XML Declaration.  XML Production #23.
       //      example:  <?xml version="1.0" encoding="utf-16" ?>
       //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
       //            allow for a possible leading BOM.
       mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status),

       //  XML Comment   production #15
       //     example:  <!-- whatever -->
       //       note, does not detect an illegal "--" within comments
       mXMLComment(UnicodeString("(?s)<!--.+?-->"), 0, status),

       //  XML Spaces
       //      production [3]; one or more white space characters
       mXMLSP(UnicodeString(XML_SPACES "+"), 0, status),

       //  XML Doctype decl  production #28
       //     example   <!DOCTYPE foo SYSTEM "somewhere" >
       //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
       //           Some internal dtd subsets could confuse this simple-minded
       //           attempt at skipping over them.
       mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status),

       //  XML PI     production #16
       //     example   <?target stuff?>
       mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status),

       //  XML Element Start   Productions #40, #41
       //          example   <foo att1='abc'  att2="d e f" >
       //      capture #1:  the tag name
       //
       mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
           "(?:"
                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  a quoted attribute value, single or double quotes
           ")*"                                                             //   * for zero or more attributes.
           XML_SPACES "*?>"), 0, status),                               // match " >"

       //  XML Element End     production #42
       //     example   </foo>
       //      capture #1:  the tag name
       mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>"), 0, status),

       // XML Element Empty    production #44
       //     example   <foo att1="abc"   att2="d e f" />
       //      capture #1:  the tag name
       //      Same pattern as the element start, except that it must end in "/>".
       mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
           "(?:"
                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  a quoted attribute value, single or double quotes
           ")*"                                                             //   * for zero or more attributes.
           XML_SPACES "*?/>"), 0, status),                              // match " />"


       // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
       //   The pattern can match the empty string.
       mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status),

       // Attribute name = "value".  XML Productions 10, 40/41
       //  Capture group 1 is name,
       //                2 is the attribute value, including the quotes.
       //
       //   Note that attributes are scanned twice.  The first time is with
       //        the regex for an entire element start.  There, the attributes
       //        are checked syntactically, but not separated out one by one.
       //        Here, we match a single attribute, and make its name and
       //        attribute value available to the parser code.
       mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
          "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status),


       // Matches a single XML white space character; used in attribute value
       //   normalization to turn each one into a plain space.
       mAttrNormalizer(UnicodeString(XML_SPACES), 0, status),

       // Match any of the new-line sequences in content.
       //   All are changed to \u000a.
       mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status),

       // & char references
       //   We will figure out what we've got based on which capture group has content.
       //   The last one is a catchall for unrecognized entity references.
       //             1     2     3      4      5           6                    7          8
       mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
                 0, status),

       fNames(status),            // table of interned element and attribute names, see intern()
       fElementStack(status),     // open ancestors of the element currently being parsed, see parse()
       fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
       {
       }

 //
 //  createParser
 //      Factory function for a parser.
 //
 //      errorCode  in/out ICU error code.  Nothing is created if it already
 //                 indicates a failure on entry.
 //      returns    the new parser, or NULL if errorCode indicates a failure,
 //                 either on entry or as a result of constructing the parser.
 //
 UXMLParser *
 UXMLParser::createParser(UErrorCode &errorCode) {
     if (U_FAILURE(errorCode)) {
         return NULL;
     }

     UXMLParser *parser = new UXMLParser(errorCode);
     if (parser == NULL) {
         // The allocation itself failed; the constructor never ran, so nothing
         // else has reported the problem yet.
         errorCode = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     if (U_FAILURE(errorCode)) {
         // The constructor could not set up one of its members.  Do not hand
         // out a parser that cannot work.
         delete parser;
         return NULL;
     }
     return parser;
 }

 // Destructor.  The body is empty; there is no explicit cleanup to perform here.
 UXMLParser::~UXMLParser() {
 }

 //
 //  parseFile
 //      Read a file, determine its charset, convert its contents to Unicode
 //      and pass the resulting text to parse().
 //
 //      filename   name of the file; it is opened in binary mode.
 //      errorCode  in/out ICU error code.  Nothing is done if it already
 //                 indicates a failure on entry.
 //      returns    the result of parse() for the converted text, or NULL if the
 //                 file could not be opened or converted.
 //
 UXMLElement *
 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
     char bytes[4096], charsetBuffer[100];
     FileStream *f;
     const char *charset, *pb;
     UnicodeString src;
     UConverter *cnv=NULL;
     UChar *buffer, *pu;
     int32_t fileLength, bytesLength, length, capacity;
     UBool flush;

     if(U_FAILURE(errorCode)) {
         return NULL;
     }

     f=T_FileStream_open(filename, "rb");
     if(f==NULL) {
         errorCode=U_FILE_ACCESS_ERROR;
         return NULL;
     }

     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
     if(bytesLength<(int32_t)sizeof(bytes)) {
         // we have already read the entire file
         fileLength=bytesLength;
     } else {
         // get the file length
         fileLength=T_FileStream_size(f);
     }

     /*
      * get the charset:
      * 1. Unicode signature
      * 2. treat as ISO-8859-1 and read XML encoding="charset"
      * 3. default to UTF-8
      */
     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
     if(U_SUCCESS(errorCode) && charset!=NULL) {
         // open converter according to Unicode signature
         cnv=ucnv_open(charset, &errorCode);
     } else {
         // read as Latin-1 and parse the XML declaration and encoding
         cnv=ucnv_open("ISO-8859-1", &errorCode);
         if(U_FAILURE(errorCode)) {
             // unexpected error opening Latin-1 converter
             goto exit;
         }

         buffer=src.getBuffer(bytesLength);
         if(buffer==NULL) {
             // unexpected failure to reserve some string capacity
             errorCode=U_MEMORY_ALLOCATION_ERROR;
             goto exit;
         }
         pb=bytes;
         pu=buffer;
         ucnv_toUnicode(
             cnv,
             &pu, buffer+src.getCapacity(),
             &pb, bytes+bytesLength,
             NULL, TRUE, &errorCode);
         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
         ucnv_close(cnv);
         cnv=NULL;
         if(U_FAILURE(errorCode)) {
             // unexpected error in conversion from Latin-1
             src.remove();
             goto exit;
         }

         // parse XML declaration
         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
             int32_t declEnd=mXMLDecl.end(errorCode);
             // go beyond <?xml
             int32_t pos=src.indexOf((UChar)x_l)+1;

             mAttrValue.reset(src);
             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute of the declaration.
                 UnicodeString attName  = mAttrValue.group(1, errorCode);
                 UnicodeString attValue = mAttrValue.group(2, errorCode);

                 // Trim the quotes from the att value.  These are left over from the original regex
                 //   that parsed the attribute, which couldn't conveniently strip them.
                 attValue.remove(0,1);                    // one char from the beginning
                 attValue.truncate(attValue.length()-1);  // and one from the end.

                 if(attName==UNICODE_STRING("encoding", 8)) {
                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
                     if(length>=(int32_t)sizeof(charsetBuffer)) {
                         // The name did not fit together with its terminating NUL,
                         // so charsetBuffer is not a usable C string.
                         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
                         goto exit;
                     }
                     charset=charsetBuffer;
                     break;
                 }
                 pos = mAttrValue.end(2, errorCode);
             }
         }

         // No encoding was named, or there is no XML declaration at all:
         // default to UTF-8.  This must happen outside of the "declaration
         // found" branch, otherwise a file without a declaration is left
         // without a converter.
         if(charset==NULL) {
             charset="UTF-8";
         }
         cnv=ucnv_open(charset, &errorCode);
     }

     if(U_FAILURE(errorCode)) {
         // unable to open the converter
         goto exit;
     }

     // convert the file contents
     capacity=fileLength;        // estimated capacity
     src.getBuffer(capacity);
     src.releaseBuffer(0);       // zero length
     flush=FALSE;
     for(;;) {
         // convert contents of bytes[bytesLength]
         pb=bytes;
         for(;;) {
             length=src.length();
             buffer=src.getBuffer(capacity);
             if(buffer==NULL) {
                 // unexpected failure to reserve some string capacity
                 errorCode=U_MEMORY_ALLOCATION_ERROR;
                 goto exit;
             }

             pu=buffer+length;
             // flush is TRUE only for the final call, after the end of the file
             // has been reached, so that the converter sees the end of the input.
             ucnv_toUnicode(
                 cnv, &pu, buffer+src.getCapacity(),
                 &pb, bytes+bytesLength,
                 NULL, flush, &errorCode);
             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                 // The output buffer is full.  Keep what has been converted so
                 // far - pb has already moved past the corresponding input -
                 // then grow the buffer and continue with the rest of this block.
                 src.releaseBuffer((int32_t)(pu-buffer));
                 errorCode=U_ZERO_ERROR;
                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
             } else {
                 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
                 break;
             }
         }

         if(U_FAILURE(errorCode)) {
             break; // conversion error
         }

         if(flush) {
             break; // completely converted the file
         }

         // read next block
         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
         if(bytesLength==0) {
             // reached end of file, convert once more to flush the converter
             flush=TRUE;
         }
     };

 exit:
     ucnv_close(cnv);
     T_FileStream_close(f);

     if(U_SUCCESS(errorCode)) {
         return parse(src, errorCode);
     } else {
         return NULL;
     }
 }

 //
 //  parse
 //      Parse an XML document that is held in a string and build the tree of
 //      UXMLElement objects for it.
 //
 //      src      the document text.
 //      status   in/out ICU error code.  Nothing is done if it already
 //               indicates a failure on entry; syntax errors are reported
 //               through error(), which sets U_PARSE_ERROR.
 //      returns  the root element, or NULL if parsing failed.
 //
 UXMLElement *
 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
     if(U_FAILURE(status)) {
         return NULL;
     }

     UXMLElement   *root = NULL;
     fPos = 0; // TODO use just a local pos variable and pass it into functions
               // where necessary?

     // Start with an empty element stack.  The stack is a member, so without
     // this an earlier parse() that stopped inside nested elements would leave
     // its entries behind for this run.
     fElementStack.removeAllElements();

     // set all matchers to work on the input string
     mXMLDecl.reset(src);
     mXMLComment.reset(src);
     mXMLSP.reset(src);
     mXMLDoctype.reset(src);
     mXMLPI.reset(src);
     mXMLElemStart.reset(src);
     mXMLElemEnd.reset(src);
     mXMLElemEmpty.reset(src);
     mXMLCharData.reset(src);
     mAttrValue.reset(src);
     mAttrNormalizer.reset(src);
     mNewLineNormalizer.reset(src);
     mAmps.reset(src);

     // Consume the XML Declaration, if present.
     if (mXMLDecl.lookingAt(fPos, status)) {
         fPos = mXMLDecl.end(status);
     }

     // Consume "misc" [XML production 27] appearing before DocType
     parseMisc(status);

     // Consume a DocType declaration, if present.
     if (mXMLDoctype.lookingAt(fPos, status)) {
         fPos = mXMLDoctype.end(status);
     }

     // Consume additional "misc" [XML production 27] appearing after the DocType
     parseMisc(status);

     // Get the root element
     if (mXMLElemEmpty.lookingAt(fPos, status)) {
         // Root is an empty element (no nested elements or content)
         root = createElement(mXMLElemEmpty, status);
         fPos = mXMLElemEmpty.end(status);
     } else {
         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
             error("Root Element expected", status);
             goto errorExit;
         }
         root = createElement(mXMLElemStart, status);
         UXMLElement  *el = root;

         //
         // This is the loop that consumes the root element of the document,
         //      including all nested content.   Nested elements are handled by
         //      explicit pushes/pops of the element stack; there is no recursion
         //      in the control flow of this code.
         //      "el" always refers to the current element, the one to which content
         //      is being added.  It is above the top of the element stack.
         for (;;) {
             // Nested Element Start
             if (mXMLElemStart.lookingAt(fPos, status)) {
                 UXMLElement *t = createElement(mXMLElemStart, status);
                 el->fChildren.addElement(t, status);
                 t->fParent = el;
                 fElementStack.push(el, status);
                 el = t;
                 continue;
             }

             // Text Content.  String is concatenated onto the current node's content,
             //                but only if it contains something other than spaces.
             UnicodeString s = scanContent(status);
             if (s.length() > 0) {
                 mXMLSP.reset(s);
                 if (mXMLSP.matches(status) == FALSE) {
                     // This chunk of text contains something other than just
                     //  white space. Make a child node for it.
                     replaceCharRefs(s, status);
                     el->fChildren.addElement(s.clone(), status);
                 }
                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
                 continue;
             }

             // Comments.  Discard.
             if (mXMLComment.lookingAt(fPos, status)) {
                 fPos = mXMLComment.end(status);
                 continue;
             }

             // PIs.  Discard.
             if (mXMLPI.lookingAt(fPos, status)) {
                 fPos = mXMLPI.end(status);
                 continue;
             }

             // Element End
             if (mXMLElemEnd.lookingAt(fPos, status)) {
                 fPos = mXMLElemEnd.end(0, status);
                 const UnicodeString name = mXMLElemEnd.group(1, status);
                 if (name != *el->fName) {
                     error("Element start / end tag mismatch", status);
                     goto errorExit;
                 }
                 if (fElementStack.empty()) {
                     // Close of the root element.  We're done with the doc.
                     el = NULL;
                     break;
                 }
                 el = (UXMLElement *)fElementStack.pop();
                 continue;
             }

             // Empty Element.  Stored as a child of the current element, but not stacked.
             if (mXMLElemEmpty.lookingAt(fPos, status)) {
                 UXMLElement *t = createElement(mXMLElemEmpty, status);
                 el->fChildren.addElement(t, status);
                 continue;
             }

             // Hit something within the document that doesn't match anything.
             //   It's an error.
             error("Unrecognized markup", status);
             break;
         }

         if (el != NULL || !fElementStack.empty()) {
             // We bailed out early, for some reason.
             error("Root element not closed.", status);
             goto errorExit;
         }
     }

     // Root Element parse is complete.
     // Consume the annoying xml "Misc" that can appear at the end of the doc.
     parseMisc(status);

     // We should have reached the end of the input
     if (fPos != src.length()) {
         error("Extra content at the end of the document", status);
         goto errorExit;
     }

     // A failure that was recorded in status without stopping the loop above
     // must not be reported together with a tree.
     if (U_FAILURE(status)) {
         goto errorExit;
     }

     // Success!
     return root;

 errorExit:
     // The stack only holds pointers to elements of the tree that is deleted
     // here; drop them so that none of them outlives the tree.
     fElementStack.removeAllElements();
     delete root;
     return NULL;
 }

 //
 //  createElement
 //      Called right after an element start tag (or empty-element tag) has been
 //      matched by mEl.  Builds the UXMLElement for that tag, collects its
 //      attributes, and moves fPos to the end of the tag.
 //
 UXMLElement *
 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
     // Capture group 1 of the tag matcher is the element name.
     UXMLElement *element = new UXMLElement(this, intern(mEl.group(1, status), status), status);

     // Replacement text for the white space normalization below.
     UnicodeString singleSpace((UChar)0x0020);

     // Attributes are scanned one at a time, beginning right behind the tag name.
     int32_t   scanPos = mEl.end(1, status);
     for (;;) {
         if (!mAttrValue.lookingAt(scanPos, status)) {
             break;      // no further attribute
         }
         UnicodeString name  = mAttrValue.group(1, status);
         UnicodeString value = mAttrValue.group(2, status);

         // The captured value still carries its quotes: drop the first and
         //   the last character.
         value.remove(0,1);
         value.truncate(value.length()-1);

         // XML attribute value normalization, see
         //   http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
         // A non-validating parser treats every attribute as CDATA, which
         //   keeps this comparatively simple.

         // 1. every new-line sequence becomes a single line feed
         mNewLineNormalizer.reset(value);
         value = mNewLineNormalizer.replaceAll(fOneLF, status);

         // 2. every XML white space character becomes a plain \u0020 space
         mAttrNormalizer.reset(value);
         value = mAttrNormalizer.replaceAll(singleSpace, status);

         // 3. character and entity references are replaced
         replaceCharRefs(value, status);

         // Record the (interned) name and a copy of the value on the element.
         element->fAttNames.addElement((void *)intern(name, status), status);
         element->fAttValues.addElement(value.clone(), status);

         // Continue behind this attribute's value.
         scanPos = mAttrValue.end(2, status);
     }

     fPos = mEl.end(0, status);
     return element;
 }

 //
 //  parseMisc
 //     Skip over XML "Misc" [production #27] at fPos: any mix of processing
 //        instructions, white space and comments.
 //     Misc may follow the document element, so this function can be called
 //        with fPos at the end of the input; the loop condition guards against
 //        scanning past it.
 //
 void
 UXMLParser::parseMisc(UErrorCode &status)  {
     while (fPos < mXMLPI.input().length()) {
         if (mXMLPI.lookingAt(fPos, status)) {
             fPos = mXMLPI.end(status);
         } else if (mXMLSP.lookingAt(fPos, status)) {
             fPos = mXMLSP.end(status);
         } else if (mXMLComment.lookingAt(fPos, status)) {
             fPos = mXMLComment.end(status);
         } else {
             // Something that is not Misc: stop here.
             break;
         }
     }
 }

 //
 //  scanContent
 //      Collect the character data at fPos, with its new-lines normalized, and
 //      move fPos behind it.  Returns an empty string if the character data
 //      matcher does not match.
 //
 UnicodeString
 UXMLParser::scanContent(UErrorCode &status) {
     UnicodeString  content;
     if (!mXMLCharData.lookingAt(fPos, status)) {
         return content;
     }

     content = mXMLCharData.group(0, status);

     // New-lines are normalized before any char ref substitution takes place.
     mNewLineNormalizer.reset(content);
     content = mNewLineNormalizer.replaceAll(fOneLF, status);

     // TODO:  handle CDATA
     fPos = mXMLCharData.end(0, status);
     return content;
 }

 //
 //   replaceCharRefs
 //
 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
 //       with the corresponding actual character.
 //
 //      s        in/out: the text to process; it receives the result.
 //      status   in/out ICU error code.  If a matcher call fails, s is left
 //               unchanged.
 //
 //      The result is assembled by hand from the text between the matches and
 //      the replacement characters.  The replacement is deliberately not passed
 //      through RegexMatcher::appendReplacement(): that function gives '$' and
 //      '\' in the replacement text a special meaning, and both can legitimately
 //      appear here (for example from "&#36;" or from an '&' followed by '$').
 //
 void
 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
     UnicodeString result;
     UnicodeString replacement;
     int32_t copiedUpTo = 0;     // everything in s before this index is already in result
     int32_t i;

     mAmps.reset(s);
     // See the initialization for the regex matcher mAmps.
     //    Which entity we've matched is determined by which capture group has content,
     //      which is flagged by start() of that group not being -1.
     while (mAmps.find()) {
         const int32_t matchStart = mAmps.start(status);
         const int32_t matchEnd = mAmps.end(status);
         if (U_FAILURE(status)) {
             return;
         }

         if (mAmps.start(1, status) != -1) {
             replacement.setTo((UChar)x_AMP);
         } else if (mAmps.start(2, status) != -1) {
             replacement.setTo((UChar)x_LT);
         } else if (mAmps.start(3, status) != -1) {
             replacement.setTo((UChar)x_GT);
         } else if (mAmps.start(4, status) != -1) {
             replacement.setTo((UChar)x_APOS);
         } else if (mAmps.start(5, status) != -1) {
             replacement.setTo((UChar)x_QUOT);
         } else if (mAmps.start(6, status) != -1) {
             // Hexadecimal reference, up to 8 digits.  Accumulate unsigned:
             //   8 digits do not fit into a signed 32-bit value.
             UnicodeString hexString = mAmps.group(6, status);
             uint32_t val = 0;
             for (i=0; i<hexString.length(); i++) {
                 val = (val << 4) + (uint32_t)u_digit(hexString.charAt(i), 16);
             }
             if (val <= 0x10ffff) {
                 replacement.setTo((UChar32)val);
             } else {
                 // Not a code point.  Leave the reference alone.
                 replacement = mAmps.group(0, status);
             }
         } else if (mAmps.start(7, status) != -1) {
             // Decimal reference, up to 8 digits.
             UnicodeString decimalString = mAmps.group(7, status);
             uint32_t val = 0;
             for (i=0; i<decimalString.length(); i++) {
                 val = val*10 + (uint32_t)u_digit(decimalString.charAt(i), 10);
             }
             if (val <= 0x10ffff) {
                 replacement.setTo((UChar32)val);
             } else {
                 // Not a code point.  Leave the reference alone.
                 replacement = mAmps.group(0, status);
             }
         } else {
             // An unrecognized &entity;  Leave it alone.
             //  TODO:  check that it really looks like an entity, and is not some
             //         random & in the text.
             replacement = mAmps.group(0, status);
         }

         // Copy the text between the previous match and this one, then the
         //   replacement for this match.
         result.append(s, copiedUpTo, matchStart - copiedUpTo);
         result.append(replacement);
         copiedUpTo = matchEnd;
     }
     // Whatever follows the last match.
     result.append(s, copiedUpTo, s.length() - copiedUpTo);
     s = result;
 }

 void
 UXMLParser::error(const char *message, UErrorCode &status) {
     // TODO:  something better here...
     const UnicodeString &src=mXMLDecl.input();
     int  line = 0;
     int  ci = 0;
     while (ci < fPos && ci>=0) {
         ci = src.indexOf((UChar)0x0a, ci+1);
         line++;
     }
     fprintf(stderr, "Error: %s at line %d\n", message, line);
     if (U_SUCCESS(status)) {
         status = U_PARSE_ERROR;
     }
 }

 //
 //  intern
 //      Intern strings like in Java: equal names map to one shared string
 //      object, the key stored in fNames, so that names can afterwards be
 //      compared by pointer.
 //
 //      s          the name to look up, and to add if it is not known yet.
 //      errorCode  in/out ICU error code, passed on to the table insertion.
 //      returns    the table's key string for s, or NULL if s was not in the
 //                 table and could not be added.
 //
 const UnicodeString *
 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
     const UHashElement *he=fNames.find(s);
     if(he==NULL) {
         // add this new name, then look up the entry that holds it
         fNames.puti(s, 0, errorCode);
         he=fNames.find(s);
         if(he==NULL) {
             // the insertion did not happen; there is no key to return
             return NULL;
         }
     }
     // return the hashed key pointer
     return (const UnicodeString *)he->key.pointer;
 }

 // Look up a name without adding it.  Returns the key string that fNames holds
 // for s, or NULL if s is not in the table.
 const UnicodeString *
 UXMLParser::findName(const UnicodeString &s) const {
     const UHashElement *entry=fNames.find(s);
     if(entry==NULL) {
         // unknown name
         return NULL;
     }
     // a known name, hand out its hashed key pointer
     return (const UnicodeString *)entry->key.pointer;
 }

 // UXMLElement ------------------------------------------------------------- ***

 // Construct an element with the given name and without attributes, children
 // or parent.
 //     parser     stored in fParser; used later for name lookups
 //     name       stored in fName as given, the string is not copied
 //     errorCode  passed to the constructors of the three vectors
 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
    fParser(parser),
    fName(name),
    fAttNames(errorCode),
    fAttValues(errorCode),
    fChildren(errorCode),
    fParent(NULL)
 {
     // nothing else to do
 }

 // Destructor.  Deletes the attribute values and the child nodes, each list
 // from its last entry to its first.
 UXMLElement::~UXMLElement() {
     // The attribute names belong to the UXMLParser; they are not deleted here.
     int32_t remaining = fAttValues.size();
     while (remaining > 0) {
         --remaining;
         delete (UObject *)fAttValues.elementAt(remaining);
     }

     remaining = fChildren.size();
     while (remaining > 0) {
         --remaining;
         delete (UObject *)fChildren.elementAt(remaining);
     }
 }

 // Returns the element's tag name, the string that fName points to.
 const UnicodeString &
 UXMLElement::getTagName() const {
     const UnicodeString &tagName = *fName;
     return tagName;
 }

 // Returns the text of this element: the concatenation of its text children
 // and, if recurse is set, of the text inside its child elements.
 UnicodeString
 UXMLElement::getText(UBool recurse) const {
     UnicodeString collected;
     appendText(collected, recurse);
     return collected;
 }

 // Appends this element's text to "text", walking the children in order.
 // Text children are appended directly; child elements contribute their own
 // text only when recurse is set.
 void
 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
     const int32_t childCount=fChildren.size();
     for(int32_t idx=0; idx<childCount; ++idx) {
         const UObject *child=(const UObject *)fChildren.elementAt(idx);
         if(child->getDynamicClassID()==UnicodeString::getStaticClassID()) {
             // a text node
             text.append(*(const UnicodeString *)child);
             continue;
         }
         if(recurse) {
             // not a string, so it must be a UXMLElement
             ((const UXMLElement *)child)->appendText(text, recurse);
         }
     }
 }

 // Returns the number of attributes of this element.
 int32_t
 UXMLElement::countAttributes() const {
     const int32_t attributeCount = fAttNames.size();
     return attributeCount;
 }

 // Get the i-th attribute.  On success, name and value receive copies of the
 // attribute's name and value, and the address of "value" is returned.
 // Returns NULL, leaving name and value untouched, if i is out of range.
 const UnicodeString *
 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
     if(i<0 || i>=fAttNames.size()) {
         return NULL;
     }
     const UnicodeString *storedName=(const UnicodeString *)fAttNames.elementAt(i);
     const UnicodeString *storedValue=(const UnicodeString *)fAttValues.elementAt(i);
     name.setTo(*storedName);
     value.setTo(*storedValue);
     return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
 }

 // Get the value of the attribute with the given name, or NULL if this
 // element has no such attribute.  The returned string is the one stored in
 // the element.
 const UnicodeString *
 UXMLElement::getAttribute(const UnicodeString &name) const {
     // Attribute names are interned, so it is enough to compare pointers
     // rather than string contents.
     const UnicodeString *interned=fParser->findName(name);
     if(interned==NULL) {
         return NULL; // the parser has not seen this name at all
     }

     const int32_t attributeCount=fAttNames.size();
     int32_t idx=0;
     while(idx<attributeCount) {
         if((const UnicodeString *)fAttNames.elementAt(idx)==interned) {
             return (const UnicodeString *)fAttValues.elementAt(idx);
         }
         ++idx;
     }
     return NULL;
 }

 // Returns the number of child nodes, text nodes and elements together.
 int32_t
 UXMLElement::countChildren() const {
     const int32_t childCount = fChildren.size();
     return childCount;
 }

 // Get the i-th child node and report in "type" whether it is an element or
 // a string.  Returns NULL, without setting type, if i is out of range.
 const UObject *
 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
     if(i<0 || i>=fChildren.size()) {
         return NULL;
     }

     const UObject *child=(const UObject *)fChildren.elementAt(i);
     const UBool isElement= child->getDynamicClassID()==UXMLElement::getStaticClassID();
     // anything that is not an element is reported as a string
     type= isElement ? UXML_NODE_TYPE_ELEMENT : UXML_NODE_TYPE_STRING;
     return child;
 }

 // Iterate over the child elements, skipping text nodes.  Starting at child
 // index i, returns the next child that is an element and leaves i just
 // behind it.  Returns NULL when there is none; i has then reached the child
 // count, unless it was negative, in which case it is not changed.
 const UXMLElement *
 UXMLElement::nextChildElement(int32_t &i) const {
     if(i<0) {
         return NULL;
     }

     for(const int32_t childCount=fChildren.size(); i<childCount;) {
         const UObject *child=(const UObject *)fChildren.elementAt(i);
         ++i;
         // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
         // if(child instanceof UXMLElement) {
         if(child->getDynamicClassID()==UXMLElement::getStaticClassID()) {
             return (const UXMLElement *)child;
         }
     }
     return NULL;
 }

 // Get the first child element with the given tag name, or NULL if there is
 // none.
 const UXMLElement *
 UXMLElement::getChildElement(const UnicodeString &name) const {
     // Element names are interned, so it is enough to compare pointers
     // rather than string contents.
     const UnicodeString *interned=fParser->findName(name);
     if(interned==NULL) {
         return NULL; // the parser has not seen this name at all
     }

     const int32_t childCount=fChildren.size();
     for(int32_t idx=0; idx<childCount; ++idx) {
         const UObject *child=(const UObject *)fChildren.elementAt(idx);
         if(child->getDynamicClassID()!=UXMLElement::getStaticClassID()) {
             continue; // a text node
         }
         const UXMLElement *candidate=(const UXMLElement *)child;
         if(candidate->fName==interned) {
             return candidate;
         }
     }
     return NULL;
 }

 U_NAMESPACE_END

 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */