/**************************************************************
 * 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * 
 *************************************************************/



#include "precompiled_xmlreader.hxx"
#include "sal/config.h"

#include <climits>
#include <cstddef>

#include "com/sun/star/container/NoSuchElementException.hpp"
#include "com/sun/star/uno/Reference.hxx"
#include "com/sun/star/uno/RuntimeException.hpp"
#include "com/sun/star/uno/XInterface.hpp"
#include "osl/diagnose.h"
#include "osl/file.h"
#include "rtl/string.h"
#include "rtl/ustring.h"
#include "rtl/ustring.hxx"
#include "sal/types.h"
#include "xmlreader/pad.hxx"
#include "xmlreader/span.hxx"
#include "xmlreader/xmlreader.hxx"

namespace xmlreader {

namespace {

namespace css = com::sun::star;

// Tells whether c is one of the four characters this reader treats as white
// space: tab, line feed, carriage return, or space.
bool isSpace(char c) {
    return c == ' ' || c == '\x09' || c == '\x0A' || c == '\x0D';
}

}

// Opens the file denoted by fileUrl for reading, maps it into memory, binds
// the predefined "xml" prefix, and puts the reader into content state at the
// start of the mapped data.
//
// Throws css::container::NoSuchElementException when opening reports
// osl_File_E_NOENT, and css::uno::RuntimeException when opening fails for any
// other reason or when the file's size cannot be obtained or it cannot be
// mapped (in which case the file handle is closed again before throwing).
XmlReader::XmlReader(rtl::OUString const & fileUrl)
    SAL_THROW((
        css::container::NoSuchElementException, css::uno::RuntimeException)):
    fileUrl_(fileUrl)
{
    oslFileError const opened = osl_openFile(
        fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
    if (opened == osl_File_E_NOENT) {
        throw css::container::NoSuchElementException(
            fileUrl_, css::uno::Reference< css::uno::XInterface >());
    }
    if (opened != osl_File_E_None) {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }

    // Determine the size first; only attempt the mapping if that succeeded.
    oslFileError mapped = osl_getFileSize(fileHandle_, &fileSize_);
    if (mapped == osl_File_E_None) {
        mapped = osl_mapFile(
            fileHandle_, &fileAddress_, fileSize_, 0,
            osl_File_MapFlag_WillNeed);
    }
    if (mapped != osl_File_E_None) {
        // The destructor will not run, so release the handle here.
        oslFileError const closed = osl_closeFile(fileHandle_);
        if (closed != osl_File_E_None) {
            OSL_TRACE(
                "osl_closeFile failed with %ld", static_cast< long >(closed));
        }
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }

    // The "xml" prefix is always bound to the XML namespace.
    namespaceIris_.push_back(
        Span(
            RTL_CONSTASCII_STRINGPARAM(
                "http://www.w3.org/XML/1998/namespace")));
    namespaces_.push_back(
        NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));

    pos_ = static_cast< char * >(fileAddress_);
    end_ = pos_ + fileSize_;
    state_ = STATE_CONTENT;
}

// Unmaps the file and closes its handle.  Failures are only traced, never
// thrown.
XmlReader::~XmlReader() {
    oslFileError const unmapped = osl_unmapFile(fileAddress_, fileSize_);
    if (unmapped != osl_File_E_None) {
        OSL_TRACE(
            "osl_unmapFile failed with %ld", static_cast< long >(unmapped));
    }
    oslFileError const closed = osl_closeFile(fileHandle_);
    if (closed != osl_File_E_None) {
        OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(closed));
    }
}

// Appends iri to the list of known namespace IRIs and returns the id assigned
// to it (its index in that list).
int XmlReader::registerNamespaceIri(Span const & iri) {
    int const newId = toNamespaceId(namespaceIris_.size());
    namespaceIris_.push_back(iri);

    Span const xsiIri(
        RTL_CONSTASCII_STRINGPARAM(
            "http://www.w3.org/2001/XMLSchema-instance"));
    if (!iri.equals(xsiIri)) {
        return newId;
    }

    // Old user layer .xcu files used the xsi namespace prefix without
    // declaring a corresponding namespace binding, see issue 77174; reading
    // those files during migration would fail without this hack that can be
    // removed once migration is no longer relevant (see
    // configmgr::Components::parseModificationLayer):
    namespaces_.push_back(
        NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), newId));
    return newId;
}

// Advances the reader by one item, dispatching on the current parser state.
// In content state, reportText selects whether text is skipped, reported raw,
// or reported normalized; data and nsId receive the results of the handler
// that is invoked.
XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
{
    if (state_ == STATE_CONTENT) {
        if (reportText == TEXT_NONE) {
            return handleSkippedText(data, nsId);
        }
        if (reportText == TEXT_RAW) {
            return handleRawText(data);
        }
        if (reportText == TEXT_NORMALIZED) {
            return handleNormalizedText(data);
        }
        // Any other reportText value is processed like STATE_START_TAG:
        return handleStartTag(nsId, data);
    }
    if (state_ == STATE_START_TAG) {
        return handleStartTag(nsId, data);
    }
    if (state_ == STATE_END_TAG) {
        return handleEndTag();
    }
    if (state_ == STATE_EMPTY_ELEMENT_TAG) {
        // An empty-element tag reports its end without consuming any input.
        handleElementEnd();
        return RESULT_END;
    }
    return RESULT_DONE; // STATE_DONE
}

bool XmlReader::nextAttribute(int * nsId, Span * localName) {
    OSL_ASSERT(nsId != 0 && localName != 0);
    if (firstAttribute_) {
        currentAttribute_ = attributes_.begin();
        firstAttribute_ = false;
    } else {
        ++currentAttribute_;
    }
    if (currentAttribute_ == attributes_.end()) {
        return false;
    }
    if (currentAttribute_->nameColon == 0) {
        *nsId = NAMESPACE_NONE;
        *localName = Span(
            currentAttribute_->nameBegin,
            currentAttribute_->nameEnd - currentAttribute_->nameBegin);
    } else {
        *nsId = getNamespaceId(
            Span(
                currentAttribute_->nameBegin,
                currentAttribute_->nameColon - currentAttribute_->nameBegin));
        *localName = Span(
            currentAttribute_->nameColon + 1,
            currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
    }
    return true;
}

// Returns the processed value of the attribute that currentAttribute_ refers
// to; fullyNormalize is passed through to handleAttributeValue.
Span XmlReader::getAttributeValue(bool fullyNormalize) {
    char const * const first = currentAttribute_->valueBegin;
    char const * const last = currentAttribute_->valueEnd;
    return handleAttributeValue(first, last, fullyNormalize);
}

// Looks up prefix among the current namespace bindings, searching from the
// most recently added binding backwards, and returns the bound namespace id
// or NAMESPACE_UNKNOWN if no binding matches.
int XmlReader::getNamespaceId(Span const & prefix) const {
    NamespaceList::const_reverse_iterator it(namespaces_.rbegin());
    while (it != namespaces_.rend()) {
        if (prefix.equals(it->prefix)) {
            return it->nsId;
        }
        ++it;
    }
    return NAMESPACE_UNKNOWN;
}

// Returns the URL of the file this reader was constructed for.
rtl::OUString XmlReader::getUrl() const {
    rtl::OUString const & url = fileUrl_;
    return url;
}

// Appends text to pad_ with line ends normalized: every CR that is directly
// followed by LF is dropped (leaving the LF), and every other CR is replaced
// by LF.
void XmlReader::normalizeLineEnds(Span const & text) {
    char const * cursor = text.begin;
    sal_Int32 remaining = text.length;
    sal_Int32 cr;
    while ((cr = rtl_str_indexOfChar_WithLength(cursor, remaining, '\x0D'))
           >= 0)
    {
        // Copy everything up to the CR, then step over the CR itself.
        pad_.add(cursor, cr);
        cursor += cr + 1;
        remaining -= cr + 1;
        bool const followedByLf = remaining != 0 && *cursor == '\x0A';
        if (!followedByLf) {
            pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
        }
    }
    pad_.add(cursor, remaining);
}

// Advances pos_ past any run of white space characters.
void XmlReader::skipSpace() {
    for (; isSpace(peek()); ++pos_) {
        // nothing to do, the loop header does the work
    }
}

bool XmlReader::skipComment() {
    if (rtl_str_shortenedCompare_WithLength(
            pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
            RTL_CONSTASCII_LENGTH("--")) !=
        0)
    {
        return false;
    }
    pos_ += RTL_CONSTASCII_LENGTH("--");
    sal_Int32 i = rtl_str_indexOfStr_WithLength(
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
    if (i < 0) {
        throw css::uno::RuntimeException(
            (rtl::OUString(
                RTL_CONSTASCII_USTRINGPARAM(
                    "premature end (within comment) of ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    pos_ += i + RTL_CONSTASCII_LENGTH("--");
    if (read() != '>') {
        throw css::uno::RuntimeException(
            (rtl::OUString(
                RTL_CONSTASCII_USTRINGPARAM(
                    "illegal \"--\" within comment in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    return true;
}

void XmlReader::skipProcessingInstruction() {
    sal_Int32 i = rtl_str_indexOfStr_WithLength(
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
    if (i < 0) {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    pos_ += i + RTL_CONSTASCII_LENGTH("?>");
}

// Skips over a document type declaration, up to and including its closing
// '>'.  Quoted literals are skipped as a whole, and an internal subset in
// square brackets is scanned separately so that comments and processing
// instructions inside it are skipped as units.  Throws
// css::uno::RuntimeException if the input ends before the declaration does,
// or if the ']' closing the internal subset is not followed (after optional
// white space) by '>'.
void XmlReader::skipDocumentTypeDeclaration() {
    // Neither is it checked that the doctypedecl is at the correct position in
    // the document, nor that it is well-formed:
    for (;;) {
        char c = read();
        switch (c) {
        case '\0': // i.e., EOF
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "premature end (within DTD) of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        case '"':
        case '\'':
            {
                // Quoted literal: jump to just past the matching quote.
                sal_Int32 i = rtl_str_indexOfChar_WithLength(
                    pos_, end_ - pos_, c);
                if (i < 0) {
                    throw css::uno::RuntimeException(
                        (rtl::OUString(
                            RTL_CONSTASCII_USTRINGPARAM(
                                "premature end (within DTD) of ")) +
                         fileUrl_),
                        css::uno::Reference< css::uno::XInterface >());
                }
                pos_ += i + 1;
            }
            break;
        case '>':
            // End of a declaration without (remaining) internal subset.
            return;
        case '[':
            // Internal subset; this inner loop is only left via return or
            // throw.
            for (;;) {
                c = read();
                switch (c) {
                case '\0': // i.e., EOF
                    throw css::uno::RuntimeException(
                        (rtl::OUString(
                            RTL_CONSTASCII_USTRINGPARAM(
                                "premature end (within DTD) of ")) +
                         fileUrl_),
                        css::uno::Reference< css::uno::XInterface >());
                case '"':
                case '\'':
                    {
                        // Quoted literal inside the internal subset.
                        sal_Int32 i = rtl_str_indexOfChar_WithLength(
                            pos_, end_ - pos_, c);
                        if (i < 0) {
                            throw css::uno::RuntimeException(
                            (rtl::OUString(
                                RTL_CONSTASCII_USTRINGPARAM(
                                    "premature end (within DTD) of ")) +
                             fileUrl_),
                            css::uno::Reference< css::uno::XInterface >());
                        }
                        pos_ += i + 1;
                    }
                    break;
                case '<':
                    // Markup inside the internal subset; only comments and
                    // processing instructions get special treatment.
                    switch (read()) {
                    case '\0': // i.e., EOF
                        throw css::uno::RuntimeException(
                            (rtl::OUString(
                                RTL_CONSTASCII_USTRINGPARAM(
                                    "premature end (within DTD) of ")) +
                             fileUrl_),
                            css::uno::Reference< css::uno::XInterface >());
                    case '!':
                        // The result is deliberately ignored: if this is not a
                        // comment, skipComment leaves pos_ alone and scanning
                        // simply continues with the next character.
                        skipComment();
                        break;
                    case '?':
                        skipProcessingInstruction();
                        break;
                    default:
                        break;
                    }
                    break;
                case ']':
                    // End of the internal subset; only white space may precede
                    // the final '>'.
                    skipSpace();
                    if (read() != '>') {
                        throw css::uno::RuntimeException(
                            (rtl::OUString(
                                RTL_CONSTASCII_USTRINGPARAM(
                                    "missing \">\" of DTD in ")) +
                             fileUrl_),
                            css::uno::Reference< css::uno::XInterface >());
                    }
                    return;
                default:
                    break;
                }
            }
        default:
            break;
        }
    }
}

// Scans a CDATA section whose leading "<!" has already been consumed.
// Returns a default-constructed Span, without moving pos_, if the input at
// pos_ does not start with "[CDATA["; otherwise consumes everything through
// the closing "]]>" and returns the section's content.  Throws
// css::uno::RuntimeException if the closing "]]>" is missing.
Span XmlReader::scanCdataSection() {
    bool const startsCdata =
        rtl_str_shortenedCompare_WithLength(
            pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
            RTL_CONSTASCII_LENGTH("[CDATA[")) ==
        0;
    if (!startsCdata) {
        return Span();
    }
    pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
    char const * const content = pos_;
    sal_Int32 const length = rtl_str_indexOfStr_WithLength(
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
    if (length >= 0) {
        pos_ += length + RTL_CONSTASCII_LENGTH("]]>");
        return Span(content, length);
    }
    throw css::uno::RuntimeException(
        (rtl::OUString(
            RTL_CONSTASCII_USTRINGPARAM(
                "premature end (within CDATA section) of ")) +
         fileUrl_),
        css::uno::Reference< css::uno::XInterface >());
}

bool XmlReader::scanName(char const ** nameColon) {
    OSL_ASSERT(nameColon != 0 && *nameColon == 0);
    for (char const * begin = pos_;; ++pos_) {
        switch (peek()) {
        case '\0': // i.e., EOF
        case '\x09':
        case '\x0A':
        case '\x0D':
        case ' ':
        case '/':
        case '=':
        case '>':
            return pos_ != begin;
        case ':':
            *nameColon = pos_;
            break;
        default:
            break;
        }
    }
}

int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
    OSL_ASSERT(begin != 0 && begin <= end);
    Span iri(handleAttributeValue(begin, end, false));
    for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
        if (namespaceIris_[i].equals(iri)) {
            return toNamespaceId(i);
        }
    }
    return XmlReader::NAMESPACE_UNKNOWN;
}

// Expands the character or entity reference that starts with the '&' at
// position and appends its replacement text to pad_.
//
// position must point at the '&'; end is one past the last character that may
// be inspected.  Nothing at or beyond end is ever read, so a reference that is
// cut off by end (e.g., a file ending in "&#x12") is reported as an error
// instead of reading past the buffer.
//
// Numeric references ("&#...;" and "&#x...;") are emitted UTF-8 encoded; of
// the named references only amp, lt, gt, apos and quot are known.
//
// Returns the position directly after the terminating ';'.  Throws
// css::uno::RuntimeException for a numeric reference that is too large, lacks
// digits or the terminating ';', or denotes a character that is not allowed,
// and for an unknown entity reference.
char const * XmlReader::handleReference(char const * position, char const * end)
{
    // Check the range before dereferencing:
    OSL_ASSERT(position != 0 && position < end && *position == '&');
    ++position;
    if (position != end && *position == '#') {
        ++position;
        sal_Int32 val = 0;
        char const * p; // start of the digits, to detect an empty digit run
        if (position != end && *position == 'x') {
            ++position;
            p = position;
            for (; position != end; ++position) {
                char c = *position;
                if (c >= '0' && c <= '9') {
                    val = 16 * val + (c - '0');
                } else if (c >= 'A' && c <= 'F') {
                    val = 16 * val + (c - 'A') + 10;
                } else if (c >= 'a' && c <= 'f') {
                    val = 16 * val + (c - 'a') + 10;
                } else {
                    break;
                }
                if (val > 0x10FFFF) { // avoid overflow
                    throw css::uno::RuntimeException(
                        (rtl::OUString(
                            RTL_CONSTASCII_USTRINGPARAM(
                                "'&#x...' too large in ")) +
                         fileUrl_),
                        css::uno::Reference< css::uno::XInterface >());
                }
            }
        } else {
            p = position;
            for (; position != end; ++position) {
                char c = *position;
                if (c >= '0' && c <= '9') {
                    val = 10 * val + (c - '0');
                } else {
                    break;
                }
                if (val > 0x10FFFF) { // avoid overflow
                    throw css::uno::RuntimeException(
                        (rtl::OUString(
                            RTL_CONSTASCII_USTRINGPARAM(
                                "'&#...' too large in ")) +
                         fileUrl_),
                        css::uno::Reference< css::uno::XInterface >());
                }
            }
        }
        // No digits at all, input exhausted, or no terminating ';':
        if (position == p || position == end || *position++ != ';') {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
        // Reject control characters other than tab/LF/CR, surrogates, and the
        // noncharacters U+FFFE and U+FFFF:
        if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
            (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
        {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "character reference denoting invalid character in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        // Encode the code point as UTF-8 (one to four bytes):
        char buf[4];
        sal_Int32 len;
        if (val < 0x80) {
            buf[0] = static_cast< char >(val);
            len = 1;
        } else if (val < 0x800) {
            buf[0] = static_cast< char >((val >> 6) | 0xC0);
            buf[1] = static_cast< char >((val & 0x3F) | 0x80);
            len = 2;
        } else if (val < 0x10000) {
            buf[0] = static_cast< char >((val >> 12) | 0xE0);
            buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
            buf[2] = static_cast< char >((val & 0x3F) | 0x80);
            len = 3;
        } else {
            buf[0] = static_cast< char >((val >> 18) | 0xF0);
            buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
            buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
            buf[3] = static_cast< char >((val & 0x3F) | 0x80);
            len = 4;
        }
        // buf is a local array, so the pad is handed the bytes via
        // addEphemeral rather than add:
        pad_.addEphemeral(buf, len);
        return position;
    } else {
        struct EntityRef {
            char const * inBegin;
            sal_Int32 inLength;
            char const * outBegin;
            sal_Int32 outLength;
        };
        static EntityRef const refs[] = {
            { RTL_CONSTASCII_STRINGPARAM("amp;"),
              RTL_CONSTASCII_STRINGPARAM("&") },
            { RTL_CONSTASCII_STRINGPARAM("lt;"),
              RTL_CONSTASCII_STRINGPARAM("<") },
            { RTL_CONSTASCII_STRINGPARAM("gt;"),
              RTL_CONSTASCII_STRINGPARAM(">") },
            { RTL_CONSTASCII_STRINGPARAM("apos;"),
              RTL_CONSTASCII_STRINGPARAM("'") },
            { RTL_CONSTASCII_STRINGPARAM("quot;"),
              RTL_CONSTASCII_STRINGPARAM("\"") } };
        // The comparison is bounded by end - position, so it cannot read past
        // end even when the input is shorter than the entity name:
        for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
            if (rtl_str_shortenedCompare_WithLength(
                    position, end - position, refs[i].inBegin, refs[i].inLength,
                    refs[i].inLength) ==
                0)
            {
                position += refs[i].inLength;
                pad_.add(refs[i].outBegin, refs[i].outLength);
                return position;
            }
        }
        throw css::uno::RuntimeException(
            (rtl::OUString(
                RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
}

// Processes the raw attribute value [begin, end) into pad_ and returns the
// result.
//
// References are always expanded (via handleReference).  Without
// fullyNormalize, each tab, LF, CR, or CR LF pair is replaced by a single
// space.  With fullyNormalize, leading and trailing white space is dropped
// and every run of white space is collapsed to a single space as well; text
// produced by references is not subject to that collapsing.
//
// The pad is cleared first, so a Span returned by an earlier call should be
// assumed to be invalidated.
Span XmlReader::handleAttributeValue(
    char const * begin, char const * end, bool fullyNormalize)
{
    pad_.clear();
    if (fullyNormalize) {
        while (begin != end && isSpace(*begin)) {
            ++begin;
        }
        while (end != begin && isSpace(end[-1])) {
            --end;
        }
        char const * p = begin;
        enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
            // a single true space character can go into the current span,
            // everything else breaks the span
        Space space = SPACE_NONE;
        while (p != end) {
            switch (*p) {
            case '\x09':
            case '\x0A':
            case '\x0D':
                switch (space) {
                case SPACE_NONE:
                    // First white space after text: flush the text and emit
                    // the one separating space explicitly.
                    pad_.add(begin, p - begin);
                    pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                    space = SPACE_BREAK;
                    break;
                case SPACE_SPAN:
                    // The pending span already ends in its single space.
                    pad_.add(begin, p - begin);
                    space = SPACE_BREAK;
                    break;
                case SPACE_BREAK:
                    break;
                }
                begin = ++p;
                break;
            case ' ':
                switch (space) {
                case SPACE_NONE:
                    // Keep a single space as part of the current span.
                    ++p;
                    space = SPACE_SPAN;
                    break;
                case SPACE_SPAN:
                    // Second space in a row: flush the span including the
                    // first space and drop this one.
                    pad_.add(begin, p - begin);
                    begin = ++p;
                    space = SPACE_BREAK;
                    break;
                case SPACE_BREAK:
                    begin = ++p;
                    break;
                }
                break;
            case '&':
                pad_.add(begin, p - begin);
                p = handleReference(p, end);
                begin = p;
                space = SPACE_NONE;
                break;
            default:
                ++p;
                space = SPACE_NONE;
                break;
            }
        }
        pad_.add(begin, p - begin);
    } else {
        char const * p = begin;
        while (p != end) {
            switch (*p) {
            case '\x09':
            case '\x0A':
                pad_.add(begin, p - begin);
                begin = ++p;
                pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                break;
            case '\x0D':
                pad_.add(begin, p - begin);
                ++p;
                // A CR LF pair counts as one line end.  Look at the character
                // following the CR within the value itself (not at the
                // parser's current position), and never step beyond end:
                if (p != end && *p == '\x0A') {
                    ++p;
                }
                begin = p;
                pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                break;
            case '&':
                pad_.add(begin, p - begin);
                p = handleReference(p, end);
                begin = p;
                break;
            default:
                ++p;
                break;
            }
        }
        pad_.add(begin, p - begin);
    }
    return pad_.get();
}

// Parses a start tag or empty-element tag whose leading '<' has already been
// consumed, with pos_ at the first character of the element name.
//
// Namespace declarations among the attributes ("xmlns" and "xmlns:prefix")
// update the default namespace resp. the list of prefix bindings; all other
// attributes are recorded in attributes_ for retrieval via nextAttribute.
// The element is pushed onto elements_, and state_ becomes
// STATE_EMPTY_ELEMENT_TAG for "<.../>" and STATE_CONTENT otherwise.
//
// On return, *nsId and *localName describe the element name (an unprefixed
// name is reported in the default namespace).  Always returns RESULT_BEGIN.
// Throws css::uno::RuntimeException on malformed input.
XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
    OSL_ASSERT(nsId != 0 && localName);
    char const * nameBegin = pos_;
    char const * nameColon = 0;
    if (!scanName(&nameColon)) {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    char const * nameEnd = pos_;
    // Remember how many bindings were in effect before this element, so that
    // handleElementEnd can drop the ones added below:
    NamespaceList::size_type inheritedNamespaces = namespaces_.size();
    bool hasDefaultNs = false;
    int defaultNsId = NAMESPACE_NONE;
    attributes_.clear();
    for (;;) {
        char const * p = pos_;
        skipSpace();
        if (peek() == '/' || peek() == '>') {
            break;
        }
        // pos_ unchanged means skipSpace found no white space:
        if (pos_ == p) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "missing whitespace before attribute in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * attrNameBegin = pos_;
        char const * attrNameColon = 0;
        if (!scanName(&attrNameColon)) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * attrNameEnd = pos_;
        skipSpace();
        if (read() != '=') {
            throw css::uno::RuntimeException(
                (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        skipSpace();
        // The value must be quoted; del is the quote character in use:
        char del = read();
        if (del != '\'' && del != '"') {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * valueBegin = pos_;
        sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
        if (i < 0) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "unterminated attribute value in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        // valueEnd points at the closing quote; continue right after it:
        char const * valueEnd = pos_ + i;
        pos_ += i + 1;
        if (attrNameColon == 0 &&
            Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
                RTL_CONSTASCII_STRINGPARAM("xmlns")))
        {
            // xmlns="...": default namespace declaration
            hasDefaultNs = true;
            defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
        } else if (attrNameColon != 0 &&
                   Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
                       RTL_CONSTASCII_STRINGPARAM("xmlns")))
        {
            // xmlns:prefix="...": bind the prefix for this element's scope
            namespaces_.push_back(
                NamespaceData(
                    Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
                    scanNamespaceIri(valueBegin, valueEnd)));
        } else {
            // Ordinary attribute; the raw value is only processed on demand
            // (see getAttributeValue):
            attributes_.push_back(
                AttributeData(
                    attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
                    valueEnd));
        }
    }
    // Without an own declaration, the default namespace is inherited from the
    // enclosing element (if any):
    if (!hasDefaultNs && !elements_.empty()) {
        defaultNsId = elements_.top().defaultNamespaceId;
    }
    firstAttribute_ = true;
    if (peek() == '/') {
        state_ = STATE_EMPTY_ELEMENT_TAG;
        ++pos_;
    } else {
        state_ = STATE_CONTENT;
    }
    if (peek() != '>') {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    ++pos_;
    elements_.push(
        ElementData(
            Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
            defaultNsId));
    // The element name is resolved only now, after all namespace declarations
    // of this tag have been processed:
    if (nameColon == 0) {
        *nsId = defaultNsId;
        *localName = Span(nameBegin, nameEnd - nameBegin);
    } else {
        *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
        *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
    }
    return RESULT_BEGIN;
}

// Parses an end tag whose leading "</" has already been consumed.  The tag
// name must equal the full name of the innermost open element, which is then
// closed.  Always returns RESULT_END; throws css::uno::RuntimeException if no
// element is open, the name does not match, or the closing '>' is missing.
XmlReader::Result XmlReader::handleEndTag() {
    if (elements_.empty()) {
        throw css::uno::RuntimeException(
            (rtl::OUString(
                RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    char const * const tagName = pos_;
    char const * colon = 0;
    bool const matches =
        scanName(&colon) &&
        elements_.top().name.equals(tagName, pos_ - tagName);
    if (!matches) {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    handleElementEnd();
    skipSpace();
    if (peek() == '>') {
        ++pos_;
        return RESULT_END;
    }
    throw css::uno::RuntimeException(
        (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
         fileUrl_),
        css::uno::Reference< css::uno::XInterface >());
}

// Closes the innermost open element: drops the namespace bindings it added,
// pops it, and moves to STATE_DONE if that was the outermost element or to
// STATE_CONTENT otherwise.
void XmlReader::handleElementEnd() {
    OSL_ASSERT(!elements_.empty());
    namespaces_.resize(elements_.top().inheritedNamespaces);
    elements_.pop();
    if (elements_.empty()) {
        state_ = STATE_DONE;
    } else {
        state_ = STATE_CONTENT;
    }
}

// Skips content up to the next start or end tag, ignoring text, comments,
// CDATA sections, processing instructions and document type declarations,
// and returns the result of handling that tag.  Throws
// css::uno::RuntimeException if the input ends before a tag is found.
XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
    for (;;) {
        sal_Int32 const lt = rtl_str_indexOfChar_WithLength(
            pos_, end_ - pos_, '<');
        if (lt < 0) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        pos_ += lt + 1;
        char const next = peek();
        if (next == '/') {
            ++pos_;
            return handleEndTag();
        }
        if (next == '!') {
            ++pos_;
            // Neither a comment nor a CDATA section: must be a doctypedecl.
            if (!skipComment() && !scanCdataSection().is()) {
                skipDocumentTypeDeclaration();
            }
            continue;
        }
        if (next == '?') {
            ++pos_;
            skipProcessingInstruction();
            continue;
        }
        return handleStartTag(nsId, data);
    }
}

// Collects the text content up to the next start or end tag into *text and
// returns RESULT_TEXT.  Line ends are normalized to LF, references are
// expanded, CDATA content is included, and comments, processing instructions
// and document type declarations are dropped.  state_ is set to
// STATE_START_TAG or STATE_END_TAG according to the tag that ended the text.
// Throws css::uno::RuntimeException if the input ends first.
XmlReader::Result XmlReader::handleRawText(Span * text) {
    pad_.clear();
    char const * chunk = pos_; // start of the not yet copied text
    for (;;) {
        char const c = peek();
        if (c == '\0') { // i.e., EOF
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        if (c == '\x0D') {
            // Drop the CR; if no LF follows, supply one.
            pad_.add(chunk, pos_ - chunk);
            ++pos_;
            if (peek() != '\x0A') {
                pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
            }
            chunk = pos_;
            continue;
        }
        if (c == '&') {
            pad_.add(chunk, pos_ - chunk);
            pos_ = handleReference(pos_, end_);
            chunk = pos_;
            continue;
        }
        if (c != '<') {
            ++pos_;
            continue;
        }
        // Markup: flush the pending text, then look at what follows the '<'.
        pad_.add(chunk, pos_ - chunk);
        ++pos_;
        switch (peek()) {
        case '/':
            *text = pad_.get();
            ++pos_;
            state_ = STATE_END_TAG;
            return RESULT_TEXT;
        case '!':
            ++pos_;
            if (!skipComment()) {
                Span cdata(scanCdataSection());
                if (cdata.is()) {
                    normalizeLineEnds(cdata);
                } else {
                    skipDocumentTypeDeclaration();
                }
            }
            chunk = pos_;
            break;
        case '?':
            ++pos_;
            skipProcessingInstruction();
            chunk = pos_;
            break;
        default:
            *text = pad_.get();
            state_ = STATE_START_TAG;
            return RESULT_TEXT;
        }
    }
}

// Collects the text content up to the next start or end tag into *text, with
// white space normalized, and returns RESULT_TEXT.  Leading and trailing
// white space is dropped and inner runs of white space become a single space;
// text produced by references and CDATA sections is taken over as is (apart
// from line end normalization of CDATA content).  state_ is set to
// STATE_START_TAG or STATE_END_TAG according to the tag that ended the text.
// Throws css::uno::RuntimeException if the input ends first.
//
// Text is copied in "flows": [flowBegin, flowEnd) is the pending stretch of
// input that ends in a non-space character and has not yet been added to
// pad_.
//
// NOTE(review): a comment or processing instruction seen while still in
// SPACE_START sets SPACE_BREAK, so following text appears to get a leading
// space added -- confirm whether that is intended.
XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
    pad_.clear();
    char const * flowBegin = pos_;
    char const * flowEnd = pos_;
    enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
        // a single true space character can go into the current flow,
        // everything else breaks the flow
    Space space = SPACE_START;
    for (;;) {
        switch (peek()) {
        case '\0': // i.e., EOF
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        case '\x09':
        case '\x0A':
        case '\x0D':
            // Tab/LF/CR always break the flow (once text has started):
            switch (space) {
            case SPACE_START:
            case SPACE_BREAK:
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                space = SPACE_BREAK;
                break;
            }
            ++pos_;
            break;
        case ' ':
            // A first space may stay inside the flow, a second one breaks it:
            switch (space) {
            case SPACE_START:
            case SPACE_BREAK:
                break;
            case SPACE_NONE:
                space = SPACE_SPAN;
                break;
            case SPACE_SPAN:
                space = SPACE_BREAK;
                break;
            }
            ++pos_;
            break;
        case '&':
            // Flush what is pending, then let handleReference append the
            // replacement text directly to pad_:
            switch (space) {
            case SPACE_START:
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                pad_.add(flowBegin, pos_ - flowBegin);
                break;
            case SPACE_BREAK:
                pad_.add(flowBegin, flowEnd - flowBegin);
                pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                break;
            }
            pos_ = handleReference(pos_, end_);
            flowBegin = pos_;
            flowEnd = pos_;
            space = SPACE_NONE;
            break;
        case '<':
            ++pos_;
            switch (peek()) {
            case '!':
                ++pos_;
                if (skipComment()) {
                    space = SPACE_BREAK;
                } else {
                    Span cdata(scanCdataSection());
                    if (cdata.is()) {
                        // CDATA is not normalized (similar to character
                        // references; it keeps the code simple), but it might
                        // arguably be better to normalize it:
                        switch (space) {
                        case SPACE_START:
                            break;
                        case SPACE_NONE:
                        case SPACE_SPAN:
                            pad_.add(flowBegin, pos_ - flowBegin);
                            break;
                        case SPACE_BREAK:
                            pad_.add(flowBegin, flowEnd - flowBegin);
                            pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                            break;
                        }
                        normalizeLineEnds(cdata);
                        flowBegin = pos_;
                        flowEnd = pos_;
                        space = SPACE_NONE;
                    } else {
                        skipDocumentTypeDeclaration();
                    }
                }
                break;
            case '/':
                // End tag: only the flow up to its last non-space character
                // is added, which drops trailing white space.
                ++pos_;
                pad_.add(flowBegin, flowEnd - flowBegin);
                *text = pad_.get();
                state_ = STATE_END_TAG;
                return RESULT_TEXT;
            case '?':
                ++pos_;
                skipProcessingInstruction();
                space = SPACE_BREAK;
                break;
            default:
                // Start tag: pos_ stays at the first character of the tag
                // name.
                pad_.add(flowBegin, flowEnd - flowBegin);
                *text = pad_.get();
                state_ = STATE_START_TAG;
                return RESULT_TEXT;
            }
            break;
        default:
            // Ordinary character: extends the current flow or starts a new
            // one.
            switch (space) {
            case SPACE_START:
                flowBegin = pos_;
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                break;
            case SPACE_BREAK:
                pad_.add(flowBegin, flowEnd - flowBegin);
                pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
                flowBegin = pos_;
                break;
            }
            flowEnd = ++pos_;
            space = SPACE_NONE;
            break;
        }
    }
}

// Converts an index into the list of namespace IRIs to a namespace id; the
// index is asserted to fit into an int.
int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
    OSL_ASSERT(pos <= INT_MAX);
    int const id = static_cast< int >(pos);
    return id;
}

}
