xref: /AOO41X/main/xmlreader/source/xmlreader.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1 /*************************************************************************
2 *
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
6 *
7 * OpenOffice.org - a multi-platform office productivity suite
8 *
9 * This file is part of OpenOffice.org.
10 *
11 * OpenOffice.org is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License version 3
13 * only, as published by the Free Software Foundation.
14 *
15 * OpenOffice.org is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU Lesser General Public License version 3 for more details
19 * (a copy is included in the LICENSE file that accompanied this code).
20 *
21 * You should have received a copy of the GNU Lesser General Public License
22 * version 3 along with OpenOffice.org.  If not, see
23 * <http://www.openoffice.org/license.html>
24 * for a copy of the LGPLv3 License.
25 *
26 ************************************************************************/
27 
28 #include "precompiled_xmlreader.hxx"
29 #include "sal/config.h"
30 
31 #include <climits>
32 #include <cstddef>
33 
34 #include "com/sun/star/container/NoSuchElementException.hpp"
35 #include "com/sun/star/uno/Reference.hxx"
36 #include "com/sun/star/uno/RuntimeException.hpp"
37 #include "com/sun/star/uno/XInterface.hpp"
38 #include "osl/diagnose.h"
39 #include "osl/file.h"
40 #include "rtl/string.h"
41 #include "rtl/ustring.h"
42 #include "rtl/ustring.hxx"
43 #include "sal/types.h"
44 #include "xmlreader/pad.hxx"
45 #include "xmlreader/span.hxx"
46 #include "xmlreader/xmlreader.hxx"
47 
48 namespace xmlreader {
49 
50 namespace {
51 
52 namespace css = com::sun::star;
53 
// Tells whether c is one of the four white space characters this reader
// recognizes: tab (0x09), line feed (0x0A), carriage return (0x0D) or space.
bool isSpace(char c) {
    return c == ' ' || c == '\x09' || c == '\x0A' || c == '\x0D';
}
65 
66 }
67 
68 XmlReader::XmlReader(rtl::OUString const & fileUrl)
69     SAL_THROW((
70         css::container::NoSuchElementException, css::uno::RuntimeException)):
71     fileUrl_(fileUrl)
72 {
73     switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
74     {
75     case osl_File_E_None:
76         break;
77     case osl_File_E_NOENT:
78         throw css::container::NoSuchElementException(
79             fileUrl_, css::uno::Reference< css::uno::XInterface >());
80     default:
81         throw css::uno::RuntimeException(
82             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
83              fileUrl_),
84             css::uno::Reference< css::uno::XInterface >());
85     }
86     oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
87     if (e == osl_File_E_None) {
88         e = osl_mapFile(
89             fileHandle_, &fileAddress_, fileSize_, 0,
90             osl_File_MapFlag_WillNeed);
91     }
92     if (e != osl_File_E_None) {
93         e = osl_closeFile(fileHandle_);
94         if (e != osl_File_E_None) {
95             OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
96         }
97         throw css::uno::RuntimeException(
98             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
99              fileUrl_),
100             css::uno::Reference< css::uno::XInterface >());
101     }
102     namespaceIris_.push_back(
103         Span(
104             RTL_CONSTASCII_STRINGPARAM(
105                 "http://www.w3.org/XML/1998/namespace")));
106     namespaces_.push_back(
107         NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
108     pos_ = static_cast< char * >(fileAddress_);
109     end_ = pos_ + fileSize_;
110     state_ = STATE_CONTENT;
111 }
112 
113 XmlReader::~XmlReader() {
114     oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
115     if (e != osl_File_E_None) {
116         OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
117     }
118     e = osl_closeFile(fileHandle_);
119     if (e != osl_File_E_None) {
120         OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
121     }
122 }
123 
124 int XmlReader::registerNamespaceIri(Span const & iri) {
125     int id = toNamespaceId(namespaceIris_.size());
126     namespaceIris_.push_back(iri);
127     if (iri.equals(
128             Span(
129                 RTL_CONSTASCII_STRINGPARAM(
130                     "http://www.w3.org/2001/XMLSchema-instance"))))
131     {
132         // Old user layer .xcu files used the xsi namespace prefix without
133         // declaring a corresponding namespace binding, see issue 77174; reading
134         // those files during migration would fail without this hack that can be
135         // removed once migration is no longer relevant (see
136         // configmgr::Components::parseModificationLayer):
137         namespaces_.push_back(
138             NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
139     }
140     return id;
141 }
142 
143 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
144 {
145     switch (state_) {
146     case STATE_CONTENT:
147         switch (reportText) {
148         case TEXT_NONE:
149             return handleSkippedText(data, nsId);
150         case TEXT_RAW:
151             return handleRawText(data);
152         case TEXT_NORMALIZED:
153             return handleNormalizedText(data);
154         }
155     case STATE_START_TAG:
156         return handleStartTag(nsId, data);
157     case STATE_END_TAG:
158         return handleEndTag();
159     case STATE_EMPTY_ELEMENT_TAG:
160         handleElementEnd();
161         return RESULT_END;
162     default: // STATE_DONE
163         return RESULT_DONE;
164     }
165 }
166 
167 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
168     OSL_ASSERT(nsId != 0 && localName != 0);
169     if (firstAttribute_) {
170         currentAttribute_ = attributes_.begin();
171         firstAttribute_ = false;
172     } else {
173         ++currentAttribute_;
174     }
175     if (currentAttribute_ == attributes_.end()) {
176         return false;
177     }
178     if (currentAttribute_->nameColon == 0) {
179         *nsId = NAMESPACE_NONE;
180         *localName = Span(
181             currentAttribute_->nameBegin,
182             currentAttribute_->nameEnd - currentAttribute_->nameBegin);
183     } else {
184         *nsId = getNamespaceId(
185             Span(
186                 currentAttribute_->nameBegin,
187                 currentAttribute_->nameColon - currentAttribute_->nameBegin));
188         *localName = Span(
189             currentAttribute_->nameColon + 1,
190             currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
191     }
192     return true;
193 }
194 
195 Span XmlReader::getAttributeValue(bool fullyNormalize) {
196     return handleAttributeValue(
197         currentAttribute_->valueBegin, currentAttribute_->valueEnd,
198         fullyNormalize);
199 }
200 
201 int XmlReader::getNamespaceId(Span const & prefix) const {
202     for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
203          i != namespaces_.rend(); ++i)
204     {
205         if (prefix.equals(i->prefix)) {
206             return i->nsId;
207         }
208     }
209     return NAMESPACE_UNKNOWN;
210 }
211 
212 rtl::OUString XmlReader::getUrl() const {
213     return fileUrl_;
214 }
215 
216 void XmlReader::normalizeLineEnds(Span const & text) {
217     char const * p = text.begin;
218     sal_Int32 n = text.length;
219     for (;;) {
220         sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
221         if (i < 0) {
222             break;
223         }
224         pad_.add(p, i);
225         p += i + 1;
226         n -= i + 1;
227         if (n == 0 || *p != '\x0A') {
228             pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
229         }
230     }
231     pad_.add(p, n);
232 }
233 
234 void XmlReader::skipSpace() {
235     while (isSpace(peek())) {
236         ++pos_;
237     }
238 }
239 
240 bool XmlReader::skipComment() {
241     if (rtl_str_shortenedCompare_WithLength(
242             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
243             RTL_CONSTASCII_LENGTH("--")) !=
244         0)
245     {
246         return false;
247     }
248     pos_ += RTL_CONSTASCII_LENGTH("--");
249     sal_Int32 i = rtl_str_indexOfStr_WithLength(
250         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
251     if (i < 0) {
252         throw css::uno::RuntimeException(
253             (rtl::OUString(
254                 RTL_CONSTASCII_USTRINGPARAM(
255                     "premature end (within comment) of ")) +
256              fileUrl_),
257             css::uno::Reference< css::uno::XInterface >());
258     }
259     pos_ += i + RTL_CONSTASCII_LENGTH("--");
260     if (read() != '>') {
261         throw css::uno::RuntimeException(
262             (rtl::OUString(
263                 RTL_CONSTASCII_USTRINGPARAM(
264                     "illegal \"--\" within comment in ")) +
265              fileUrl_),
266             css::uno::Reference< css::uno::XInterface >());
267     }
268     return true;
269 }
270 
271 void XmlReader::skipProcessingInstruction() {
272     sal_Int32 i = rtl_str_indexOfStr_WithLength(
273         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
274     if (i < 0) {
275         throw css::uno::RuntimeException(
276             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
277              fileUrl_),
278             css::uno::Reference< css::uno::XInterface >());
279     }
280     pos_ += i + RTL_CONSTASCII_LENGTH("?>");
281 }
282 
283 void XmlReader::skipDocumentTypeDeclaration() {
284     // Neither is it checked that the doctypedecl is at the correct position in
285     // the document, nor that it is well-formed:
286     for (;;) {
287         char c = read();
288         switch (c) {
289         case '\0': // i.e., EOF
290             throw css::uno::RuntimeException(
291                 (rtl::OUString(
292                     RTL_CONSTASCII_USTRINGPARAM(
293                         "premature end (within DTD) of ")) +
294                  fileUrl_),
295                 css::uno::Reference< css::uno::XInterface >());
296         case '"':
297         case '\'':
298             {
299                 sal_Int32 i = rtl_str_indexOfChar_WithLength(
300                     pos_, end_ - pos_, c);
301                 if (i < 0) {
302                     throw css::uno::RuntimeException(
303                         (rtl::OUString(
304                             RTL_CONSTASCII_USTRINGPARAM(
305                                 "premature end (within DTD) of ")) +
306                          fileUrl_),
307                         css::uno::Reference< css::uno::XInterface >());
308                 }
309                 pos_ += i + 1;
310             }
311             break;
312         case '>':
313             return;
314         case '[':
315             for (;;) {
316                 c = read();
317                 switch (c) {
318                 case '\0': // i.e., EOF
319                     throw css::uno::RuntimeException(
320                         (rtl::OUString(
321                             RTL_CONSTASCII_USTRINGPARAM(
322                                 "premature end (within DTD) of ")) +
323                          fileUrl_),
324                         css::uno::Reference< css::uno::XInterface >());
325                 case '"':
326                 case '\'':
327                     {
328                         sal_Int32 i = rtl_str_indexOfChar_WithLength(
329                             pos_, end_ - pos_, c);
330                         if (i < 0) {
331                             throw css::uno::RuntimeException(
332                             (rtl::OUString(
333                                 RTL_CONSTASCII_USTRINGPARAM(
334                                     "premature end (within DTD) of ")) +
335                              fileUrl_),
336                             css::uno::Reference< css::uno::XInterface >());
337                         }
338                         pos_ += i + 1;
339                     }
340                     break;
341                 case '<':
342                     switch (read()) {
343                     case '\0': // i.e., EOF
344                         throw css::uno::RuntimeException(
345                             (rtl::OUString(
346                                 RTL_CONSTASCII_USTRINGPARAM(
347                                     "premature end (within DTD) of ")) +
348                              fileUrl_),
349                             css::uno::Reference< css::uno::XInterface >());
350                     case '!':
351                         skipComment();
352                         break;
353                     case '?':
354                         skipProcessingInstruction();
355                         break;
356                     default:
357                         break;
358                     }
359                     break;
360                 case ']':
361                     skipSpace();
362                     if (read() != '>') {
363                         throw css::uno::RuntimeException(
364                             (rtl::OUString(
365                                 RTL_CONSTASCII_USTRINGPARAM(
366                                     "missing \">\" of DTD in ")) +
367                              fileUrl_),
368                             css::uno::Reference< css::uno::XInterface >());
369                     }
370                     return;
371                 default:
372                     break;
373                 }
374             }
375         default:
376             break;
377         }
378     }
379 }
380 
381 Span XmlReader::scanCdataSection() {
382     if (rtl_str_shortenedCompare_WithLength(
383             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
384             RTL_CONSTASCII_LENGTH("[CDATA[")) !=
385         0)
386     {
387         return Span();
388     }
389     pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
390     char const * begin = pos_;
391     sal_Int32 i = rtl_str_indexOfStr_WithLength(
392         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
393     if (i < 0) {
394         throw css::uno::RuntimeException(
395             (rtl::OUString(
396                 RTL_CONSTASCII_USTRINGPARAM(
397                     "premature end (within CDATA section) of ")) +
398              fileUrl_),
399             css::uno::Reference< css::uno::XInterface >());
400     }
401     pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
402     return Span(begin, i);
403 }
404 
405 bool XmlReader::scanName(char const ** nameColon) {
406     OSL_ASSERT(nameColon != 0 && *nameColon == 0);
407     for (char const * begin = pos_;; ++pos_) {
408         switch (peek()) {
409         case '\0': // i.e., EOF
410         case '\x09':
411         case '\x0A':
412         case '\x0D':
413         case ' ':
414         case '/':
415         case '=':
416         case '>':
417             return pos_ != begin;
418         case ':':
419             *nameColon = pos_;
420             break;
421         default:
422             break;
423         }
424     }
425 }
426 
427 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
428     OSL_ASSERT(begin != 0 && begin <= end);
429     Span iri(handleAttributeValue(begin, end, false));
430     for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
431         if (namespaceIris_[i].equals(iri)) {
432             return toNamespaceId(i);
433         }
434     }
435     return XmlReader::NAMESPACE_UNKNOWN;
436 }
437 
438 char const * XmlReader::handleReference(char const * position, char const * end)
439 {
440     OSL_ASSERT(position != 0 && *position == '&' && position < end);
441     ++position;
442     if (*position == '#') {
443         ++position;
444         sal_Int32 val = 0;
445         char const * p;
446         if (*position == 'x') {
447             ++position;
448             p = position;
449             for (;; ++position) {
450                 char c = *position;
451                 if (c >= '0' && c <= '9') {
452                     val = 16 * val + (c - '0');
453                 } else if (c >= 'A' && c <= 'F') {
454                     val = 16 * val + (c - 'A') + 10;
455                 } else if (c >= 'a' && c <= 'f') {
456                     val = 16 * val + (c - 'a') + 10;
457                 } else {
458                     break;
459                 }
460                 if (val > 0x10FFFF) { // avoid overflow
461                     throw css::uno::RuntimeException(
462                         (rtl::OUString(
463                             RTL_CONSTASCII_USTRINGPARAM(
464                                 "'&#x...' too large in ")) +
465                          fileUrl_),
466                         css::uno::Reference< css::uno::XInterface >());
467                 }
468             }
469         } else {
470             p = position;
471             for (;; ++position) {
472                 char c = *position;
473                 if (c >= '0' && c <= '9') {
474                     val = 10 * val + (c - '0');
475                 } else {
476                     break;
477                 }
478                 if (val > 0x10FFFF) { // avoid overflow
479                     throw css::uno::RuntimeException(
480                         (rtl::OUString(
481                             RTL_CONSTASCII_USTRINGPARAM(
482                                 "'&#...' too large in ")) +
483                          fileUrl_),
484                         css::uno::Reference< css::uno::XInterface >());
485                 }
486             }
487         }
488         if (position == p || *position++ != ';') {
489             throw css::uno::RuntimeException(
490                 (rtl::OUString(
491                     RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
492                  fileUrl_),
493                 css::uno::Reference< css::uno::XInterface >());
494         }
495         OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
496         if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
497             (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
498         {
499             throw css::uno::RuntimeException(
500                 (rtl::OUString(
501                     RTL_CONSTASCII_USTRINGPARAM(
502                         "character reference denoting invalid character in ")) +
503                  fileUrl_),
504                 css::uno::Reference< css::uno::XInterface >());
505         }
506         char buf[4];
507         sal_Int32 len;
508         if (val < 0x80) {
509             buf[0] = static_cast< char >(val);
510             len = 1;
511         } else if (val < 0x800) {
512             buf[0] = static_cast< char >((val >> 6) | 0xC0);
513             buf[1] = static_cast< char >((val & 0x3F) | 0x80);
514             len = 2;
515         } else if (val < 0x10000) {
516             buf[0] = static_cast< char >((val >> 12) | 0xE0);
517             buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
518             buf[2] = static_cast< char >((val & 0x3F) | 0x80);
519             len = 3;
520         } else {
521             buf[0] = static_cast< char >((val >> 18) | 0xF0);
522             buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
523             buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
524             buf[3] = static_cast< char >((val & 0x3F) | 0x80);
525             len = 4;
526         }
527         pad_.addEphemeral(buf, len);
528         return position;
529     } else {
530         struct EntityRef {
531             char const * inBegin;
532             sal_Int32 inLength;
533             char const * outBegin;
534             sal_Int32 outLength;
535         };
536         static EntityRef const refs[] = {
537             { RTL_CONSTASCII_STRINGPARAM("amp;"),
538               RTL_CONSTASCII_STRINGPARAM("&") },
539             { RTL_CONSTASCII_STRINGPARAM("lt;"),
540               RTL_CONSTASCII_STRINGPARAM("<") },
541             { RTL_CONSTASCII_STRINGPARAM("gt;"),
542               RTL_CONSTASCII_STRINGPARAM(">") },
543             { RTL_CONSTASCII_STRINGPARAM("apos;"),
544               RTL_CONSTASCII_STRINGPARAM("'") },
545             { RTL_CONSTASCII_STRINGPARAM("quot;"),
546               RTL_CONSTASCII_STRINGPARAM("\"") } };
547         for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
548             if (rtl_str_shortenedCompare_WithLength(
549                     position, end - position, refs[i].inBegin, refs[i].inLength,
550                     refs[i].inLength) ==
551                 0)
552             {
553                 position += refs[i].inLength;
554                 pad_.add(refs[i].outBegin, refs[i].outLength);
555                 return position;
556             }
557         }
558         throw css::uno::RuntimeException(
559             (rtl::OUString(
560                 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
561              fileUrl_),
562             css::uno::Reference< css::uno::XInterface >());
563     }
564 }
565 
566 Span XmlReader::handleAttributeValue(
567     char const * begin, char const * end, bool fullyNormalize)
568 {
569     pad_.clear();
570     if (fullyNormalize) {
571         while (begin != end && isSpace(*begin)) {
572             ++begin;
573         }
574         while (end != begin && isSpace(end[-1])) {
575             --end;
576         }
577         char const * p = begin;
578         enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
579             // a single true space character can go into the current span,
580             // everything else breaks the span
581         Space space = SPACE_NONE;
582         while (p != end) {
583             switch (*p) {
584             case '\x09':
585             case '\x0A':
586             case '\x0D':
587                 switch (space) {
588                 case SPACE_NONE:
589                     pad_.add(begin, p - begin);
590                     pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
591                     space = SPACE_BREAK;
592                     break;
593                 case SPACE_SPAN:
594                     pad_.add(begin, p - begin);
595                     space = SPACE_BREAK;
596                     break;
597                 case SPACE_BREAK:
598                     break;
599                 }
600                 begin = ++p;
601                 break;
602             case ' ':
603                 switch (space) {
604                 case SPACE_NONE:
605                     ++p;
606                     space = SPACE_SPAN;
607                     break;
608                 case SPACE_SPAN:
609                     pad_.add(begin, p - begin);
610                     begin = ++p;
611                     space = SPACE_BREAK;
612                     break;
613                 case SPACE_BREAK:
614                     begin = ++p;
615                     break;
616                 }
617                 break;
618             case '&':
619                 pad_.add(begin, p - begin);
620                 p = handleReference(p, end);
621                 begin = p;
622                 space = SPACE_NONE;
623                 break;
624             default:
625                 ++p;
626                 space = SPACE_NONE;
627                 break;
628             }
629         }
630         pad_.add(begin, p - begin);
631     } else {
632         char const * p = begin;
633         while (p != end) {
634             switch (*p) {
635             case '\x09':
636             case '\x0A':
637                 pad_.add(begin, p - begin);
638                 begin = ++p;
639                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
640                 break;
641             case '\x0D':
642                 pad_.add(begin, p - begin);
643                 ++p;
644                 if (peek() == '\x0A') {
645                     ++p;
646                 }
647                 begin = p;
648                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
649                 break;
650             case '&':
651                 pad_.add(begin, p - begin);
652                 p = handleReference(p, end);
653                 begin = p;
654                 break;
655             default:
656                 ++p;
657                 break;
658             }
659         }
660         pad_.add(begin, p - begin);
661     }
662     return pad_.get();
663 }
664 
665 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
666     OSL_ASSERT(nsId != 0 && localName);
667     char const * nameBegin = pos_;
668     char const * nameColon = 0;
669     if (!scanName(&nameColon)) {
670         throw css::uno::RuntimeException(
671             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
672              fileUrl_),
673             css::uno::Reference< css::uno::XInterface >());
674     }
675     char const * nameEnd = pos_;
676     NamespaceList::size_type inheritedNamespaces = namespaces_.size();
677     bool hasDefaultNs = false;
678     int defaultNsId = NAMESPACE_NONE;
679     attributes_.clear();
680     for (;;) {
681         char const * p = pos_;
682         skipSpace();
683         if (peek() == '/' || peek() == '>') {
684             break;
685         }
686         if (pos_ == p) {
687             throw css::uno::RuntimeException(
688                 (rtl::OUString(
689                     RTL_CONSTASCII_USTRINGPARAM(
690                         "missing whitespace before attribute in ")) +
691                  fileUrl_),
692                 css::uno::Reference< css::uno::XInterface >());
693         }
694         char const * attrNameBegin = pos_;
695         char const * attrNameColon = 0;
696         if (!scanName(&attrNameColon)) {
697             throw css::uno::RuntimeException(
698                 (rtl::OUString(
699                     RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
700                  fileUrl_),
701                 css::uno::Reference< css::uno::XInterface >());
702         }
703         char const * attrNameEnd = pos_;
704         skipSpace();
705         if (read() != '=') {
706             throw css::uno::RuntimeException(
707                 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
708                  fileUrl_),
709                 css::uno::Reference< css::uno::XInterface >());
710         }
711         skipSpace();
712         char del = read();
713         if (del != '\'' && del != '"') {
714             throw css::uno::RuntimeException(
715                 (rtl::OUString(
716                     RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
717                  fileUrl_),
718                 css::uno::Reference< css::uno::XInterface >());
719         }
720         char const * valueBegin = pos_;
721         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
722         if (i < 0) {
723             throw css::uno::RuntimeException(
724                 (rtl::OUString(
725                     RTL_CONSTASCII_USTRINGPARAM(
726                         "unterminated attribute value in ")) +
727                  fileUrl_),
728                 css::uno::Reference< css::uno::XInterface >());
729         }
730         char const * valueEnd = pos_ + i;
731         pos_ += i + 1;
732         if (attrNameColon == 0 &&
733             Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
734                 RTL_CONSTASCII_STRINGPARAM("xmlns")))
735         {
736             hasDefaultNs = true;
737             defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
738         } else if (attrNameColon != 0 &&
739                    Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
740                        RTL_CONSTASCII_STRINGPARAM("xmlns")))
741         {
742             namespaces_.push_back(
743                 NamespaceData(
744                     Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
745                     scanNamespaceIri(valueBegin, valueEnd)));
746         } else {
747             attributes_.push_back(
748                 AttributeData(
749                     attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
750                     valueEnd));
751         }
752     }
753     if (!hasDefaultNs && !elements_.empty()) {
754         defaultNsId = elements_.top().defaultNamespaceId;
755     }
756     firstAttribute_ = true;
757     if (peek() == '/') {
758         state_ = STATE_EMPTY_ELEMENT_TAG;
759         ++pos_;
760     } else {
761         state_ = STATE_CONTENT;
762     }
763     if (peek() != '>') {
764         throw css::uno::RuntimeException(
765             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
766              fileUrl_),
767             css::uno::Reference< css::uno::XInterface >());
768     }
769     ++pos_;
770     elements_.push(
771         ElementData(
772             Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
773             defaultNsId));
774     if (nameColon == 0) {
775         *nsId = defaultNsId;
776         *localName = Span(nameBegin, nameEnd - nameBegin);
777     } else {
778         *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
779         *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
780     }
781     return RESULT_BEGIN;
782 }
783 
784 XmlReader::Result XmlReader::handleEndTag() {
785     if (elements_.empty()) {
786         throw css::uno::RuntimeException(
787             (rtl::OUString(
788                 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
789              fileUrl_),
790             css::uno::Reference< css::uno::XInterface >());
791     }
792     char const * nameBegin = pos_;
793     char const * nameColon = 0;
794     if (!scanName(&nameColon) ||
795         !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
796     {
797         throw css::uno::RuntimeException(
798             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
799              fileUrl_),
800             css::uno::Reference< css::uno::XInterface >());
801     }
802     handleElementEnd();
803     skipSpace();
804     if (peek() != '>') {
805         throw css::uno::RuntimeException(
806             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
807              fileUrl_),
808             css::uno::Reference< css::uno::XInterface >());
809     }
810     ++pos_;
811     return RESULT_END;
812 }
813 
814 void XmlReader::handleElementEnd() {
815     OSL_ASSERT(!elements_.empty());
816     namespaces_.resize(elements_.top().inheritedNamespaces);
817     elements_.pop();
818     state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
819 }
820 
821 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
822     for (;;) {
823         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
824         if (i < 0) {
825             throw css::uno::RuntimeException(
826                 (rtl::OUString(
827                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
828                  fileUrl_),
829                 css::uno::Reference< css::uno::XInterface >());
830         }
831         pos_ += i + 1;
832         switch (peek()) {
833         case '!':
834             ++pos_;
835             if (!skipComment() && !scanCdataSection().is()) {
836                 skipDocumentTypeDeclaration();
837             }
838             break;
839         case '/':
840             ++pos_;
841             return handleEndTag();
842         case '?':
843             ++pos_;
844             skipProcessingInstruction();
845             break;
846         default:
847             return handleStartTag(nsId, data);
848         }
849     }
850 }
851 
852 XmlReader::Result XmlReader::handleRawText(Span * text) {
853     pad_.clear();
854     for (char const * begin = pos_;;) {
855         switch (peek()) {
856         case '\0': // i.e., EOF
857             throw css::uno::RuntimeException(
858                 (rtl::OUString(
859                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
860                  fileUrl_),
861                 css::uno::Reference< css::uno::XInterface >());
862         case '\x0D':
863             pad_.add(begin, pos_ - begin);
864             ++pos_;
865             if (peek() != '\x0A') {
866                 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
867             }
868             begin = pos_;
869             break;
870         case '&':
871             pad_.add(begin, pos_ - begin);
872             pos_ = handleReference(pos_, end_);
873             begin = pos_;
874             break;
875         case '<':
876             pad_.add(begin, pos_ - begin);
877             ++pos_;
878             switch (peek()) {
879             case '!':
880                 ++pos_;
881                 if (!skipComment()) {
882                     Span cdata(scanCdataSection());
883                     if (cdata.is()) {
884                         normalizeLineEnds(cdata);
885                     } else {
886                         skipDocumentTypeDeclaration();
887                     }
888                 }
889                 begin = pos_;
890                 break;
891             case '/':
892                 *text = pad_.get();
893                 ++pos_;
894                 state_ = STATE_END_TAG;
895                 return RESULT_TEXT;
896             case '?':
897                 ++pos_;
898                 skipProcessingInstruction();
899                 begin = pos_;
900                 break;
901             default:
902                 *text = pad_.get();
903                 state_ = STATE_START_TAG;
904                 return RESULT_TEXT;
905             }
906             break;
907         default:
908             ++pos_;
909             break;
910         }
911     }
912 }
913 
914 XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
915     pad_.clear();
916     char const * flowBegin = pos_;
917     char const * flowEnd = pos_;
918     enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
919         // a single true space character can go into the current flow,
920         // everything else breaks the flow
921     Space space = SPACE_START;
922     for (;;) {
923         switch (peek()) {
924         case '\0': // i.e., EOF
925             throw css::uno::RuntimeException(
926                 (rtl::OUString(
927                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
928                  fileUrl_),
929                 css::uno::Reference< css::uno::XInterface >());
930         case '\x09':
931         case '\x0A':
932         case '\x0D':
933             switch (space) {
934             case SPACE_START:
935             case SPACE_BREAK:
936                 break;
937             case SPACE_NONE:
938             case SPACE_SPAN:
939                 space = SPACE_BREAK;
940                 break;
941             }
942             ++pos_;
943             break;
944         case ' ':
945             switch (space) {
946             case SPACE_START:
947             case SPACE_BREAK:
948                 break;
949             case SPACE_NONE:
950                 space = SPACE_SPAN;
951                 break;
952             case SPACE_SPAN:
953                 space = SPACE_BREAK;
954                 break;
955             }
956             ++pos_;
957             break;
958         case '&':
959             switch (space) {
960             case SPACE_START:
961                 break;
962             case SPACE_NONE:
963             case SPACE_SPAN:
964                 pad_.add(flowBegin, pos_ - flowBegin);
965                 break;
966             case SPACE_BREAK:
967                 pad_.add(flowBegin, flowEnd - flowBegin);
968                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
969                 break;
970             }
971             pos_ = handleReference(pos_, end_);
972             flowBegin = pos_;
973             flowEnd = pos_;
974             space = SPACE_NONE;
975             break;
976         case '<':
977             ++pos_;
978             switch (peek()) {
979             case '!':
980                 ++pos_;
981                 if (skipComment()) {
982                     space = SPACE_BREAK;
983                 } else {
984                     Span cdata(scanCdataSection());
985                     if (cdata.is()) {
986                         // CDATA is not normalized (similar to character
987                         // references; it keeps the code simple), but it might
988                         // arguably be better to normalize it:
989                         switch (space) {
990                         case SPACE_START:
991                             break;
992                         case SPACE_NONE:
993                         case SPACE_SPAN:
994                             pad_.add(flowBegin, pos_ - flowBegin);
995                             break;
996                         case SPACE_BREAK:
997                             pad_.add(flowBegin, flowEnd - flowBegin);
998                             pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
999                             break;
1000                         }
1001                         normalizeLineEnds(cdata);
1002                         flowBegin = pos_;
1003                         flowEnd = pos_;
1004                         space = SPACE_NONE;
1005                     } else {
1006                         skipDocumentTypeDeclaration();
1007                     }
1008                 }
1009                 break;
1010             case '/':
1011                 ++pos_;
1012                 pad_.add(flowBegin, flowEnd - flowBegin);
1013                 *text = pad_.get();
1014                 state_ = STATE_END_TAG;
1015                 return RESULT_TEXT;
1016             case '?':
1017                 ++pos_;
1018                 skipProcessingInstruction();
1019                 space = SPACE_BREAK;
1020                 break;
1021             default:
1022                 pad_.add(flowBegin, flowEnd - flowBegin);
1023                 *text = pad_.get();
1024                 state_ = STATE_START_TAG;
1025                 return RESULT_TEXT;
1026             }
1027             break;
1028         default:
1029             switch (space) {
1030             case SPACE_START:
1031                 flowBegin = pos_;
1032                 break;
1033             case SPACE_NONE:
1034             case SPACE_SPAN:
1035                 break;
1036             case SPACE_BREAK:
1037                 pad_.add(flowBegin, flowEnd - flowBegin);
1038                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1039                 flowBegin = pos_;
1040                 break;
1041             }
1042             flowEnd = ++pos_;
1043             space = SPACE_NONE;
1044             break;
1045         }
1046     }
1047 }
1048 
1049 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1050     OSL_ASSERT(pos <= INT_MAX);
1051     return static_cast< int >(pos);
1052 }
1053 
1054 }
1055