1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 #include "precompiled_xmlreader.hxx" 25 #include "sal/config.h" 26 27 #include <climits> 28 #include <cstddef> 29 30 #include "com/sun/star/container/NoSuchElementException.hpp" 31 #include "com/sun/star/uno/Reference.hxx" 32 #include "com/sun/star/uno/RuntimeException.hpp" 33 #include "com/sun/star/uno/XInterface.hpp" 34 #include "osl/diagnose.h" 35 #include "osl/file.h" 36 #include "rtl/string.h" 37 #include "rtl/ustring.h" 38 #include "rtl/ustring.hxx" 39 #include "sal/types.h" 40 #include "xmlreader/pad.hxx" 41 #include "xmlreader/span.hxx" 42 #include "xmlreader/xmlreader.hxx" 43 44 namespace xmlreader { 45 46 namespace { 47 48 namespace css = com::sun::star; 49 50 bool isSpace(char c) { 51 switch (c) { 52 case '\x09': 53 case '\x0A': 54 case '\x0D': 55 case ' ': 56 return true; 57 default: 58 return false; 59 } 60 } 61 62 } 63 64 XmlReader::XmlReader(rtl::OUString const & fileUrl) 65 SAL_THROW(( 66 css::container::NoSuchElementException, css::uno::RuntimeException)): 67 fileUrl_(fileUrl) 68 { 69 switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read)) 70 { 71 case osl_File_E_None: 72 break; 73 case osl_File_E_NOENT: 74 throw css::container::NoSuchElementException( 75 fileUrl_, css::uno::Reference< css::uno::XInterface >()); 76 default: 77 throw css::uno::RuntimeException( 78 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) + 79 fileUrl_), 80 css::uno::Reference< css::uno::XInterface >()); 81 } 82 oslFileError e = osl_getFileSize(fileHandle_, &fileSize_); 83 if (e == osl_File_E_None) { 84 e = osl_mapFile( 85 fileHandle_, &fileAddress_, fileSize_, 0, 86 osl_File_MapFlag_WillNeed); 87 } 88 if (e != osl_File_E_None) { 89 e = osl_closeFile(fileHandle_); 90 if (e != osl_File_E_None) { 91 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 92 } 93 throw css::uno::RuntimeException( 94 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) + 95 fileUrl_), 96 css::uno::Reference< css::uno::XInterface >()); 97 } 98 namespaceIris_.push_back( 99 Span( 100 RTL_CONSTASCII_STRINGPARAM( 101 "http://www.w3.org/XML/1998/namespace"))); 102 namespaces_.push_back( 103 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML)); 104 pos_ = static_cast< char * >(fileAddress_); 105 end_ = pos_ + fileSize_; 106 state_ = STATE_CONTENT; 107 } 108 109 XmlReader::~XmlReader() { 110 oslFileError e = osl_unmapFile(fileAddress_, fileSize_); 111 if (e != osl_File_E_None) { 112 OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e)); 113 } 114 e = osl_closeFile(fileHandle_); 115 if (e != osl_File_E_None) { 116 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 117 } 118 } 119 120 int XmlReader::registerNamespaceIri(Span const & iri) { 121 int id = toNamespaceId(namespaceIris_.size()); 122 namespaceIris_.push_back(iri); 123 if (iri.equals( 124 Span( 125 RTL_CONSTASCII_STRINGPARAM( 126 "http://www.w3.org/2001/XMLSchema-instance")))) 127 { 128 // Old user layer .xcu files used the xsi namespace prefix without 129 // declaring a corresponding namespace binding, see issue 77174; reading 130 // those files during migration would fail without this hack that can be 131 // removed once migration is no longer relevant (see 132 // configmgr::Components::parseModificationLayer): 133 namespaces_.push_back( 134 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id)); 135 } 136 return id; 137 } 138 139 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) 140 { 141 switch (state_) { 142 case STATE_CONTENT: 143 switch (reportText) { 144 case TEXT_NONE: 145 return handleSkippedText(data, nsId); 146 case TEXT_RAW: 147 return handleRawText(data); 148 case TEXT_NORMALIZED: 149 return handleNormalizedText(data); 150 } 151 case STATE_START_TAG: 152 return handleStartTag(nsId, data); 153 case STATE_END_TAG: 154 return handleEndTag(); 155 case STATE_EMPTY_ELEMENT_TAG: 156 handleElementEnd(); 157 return RESULT_END; 158 default: // STATE_DONE 159 return RESULT_DONE; 160 } 161 } 162 163 bool XmlReader::nextAttribute(int * nsId, Span * localName) { 164 OSL_ASSERT(nsId != 0 && localName != 0); 165 if (firstAttribute_) { 166 currentAttribute_ = attributes_.begin(); 167 firstAttribute_ = false; 168 } else { 169 ++currentAttribute_; 170 } 171 if (currentAttribute_ == attributes_.end()) { 172 return false; 173 } 174 if (currentAttribute_->nameColon == 0) { 175 *nsId = NAMESPACE_NONE; 176 *localName = Span( 177 currentAttribute_->nameBegin, 178 currentAttribute_->nameEnd - currentAttribute_->nameBegin); 179 } else { 180 *nsId = getNamespaceId( 181 Span( 182 currentAttribute_->nameBegin, 183 currentAttribute_->nameColon - currentAttribute_->nameBegin)); 184 *localName = Span( 185 currentAttribute_->nameColon + 1, 186 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); 187 } 188 return true; 189 } 190 191 Span XmlReader::getAttributeValue(bool fullyNormalize) { 192 return handleAttributeValue( 193 currentAttribute_->valueBegin, currentAttribute_->valueEnd, 194 fullyNormalize); 195 } 196 197 int XmlReader::getNamespaceId(Span const & prefix) const { 198 for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin()); 199 i != namespaces_.rend(); ++i) 200 { 201 if (prefix.equals(i->prefix)) { 202 return i->nsId; 203 } 204 } 205 return NAMESPACE_UNKNOWN; 206 } 207 208 rtl::OUString XmlReader::getUrl() const { 209 return fileUrl_; 210 } 211 212 void XmlReader::normalizeLineEnds(Span const & text) { 213 char const * p = text.begin; 214 sal_Int32 n = text.length; 215 for (;;) { 216 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); 217 if (i < 0) { 218 break; 219 } 220 pad_.add(p, i); 221 p += i + 1; 222 n -= i + 1; 223 if (n == 0 || *p != '\x0A') { 224 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 225 } 226 } 227 pad_.add(p, n); 228 } 229 230 void XmlReader::skipSpace() { 231 while (isSpace(peek())) { 232 ++pos_; 233 } 234 } 235 236 bool XmlReader::skipComment() { 237 if (rtl_str_shortenedCompare_WithLength( 238 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), 239 RTL_CONSTASCII_LENGTH("--")) != 240 0) 241 { 242 return false; 243 } 244 pos_ += RTL_CONSTASCII_LENGTH("--"); 245 sal_Int32 i = rtl_str_indexOfStr_WithLength( 246 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); 247 if (i < 0) { 248 throw css::uno::RuntimeException( 249 (rtl::OUString( 250 RTL_CONSTASCII_USTRINGPARAM( 251 "premature end (within comment) of ")) + 252 fileUrl_), 253 css::uno::Reference< css::uno::XInterface >()); 254 } 255 pos_ += i + RTL_CONSTASCII_LENGTH("--"); 256 if (read() != '>') { 257 throw css::uno::RuntimeException( 258 (rtl::OUString( 259 RTL_CONSTASCII_USTRINGPARAM( 260 "illegal \"--\" within comment in ")) + 261 fileUrl_), 262 css::uno::Reference< css::uno::XInterface >()); 263 } 264 return true; 265 } 266 267 void XmlReader::skipProcessingInstruction() { 268 sal_Int32 i = rtl_str_indexOfStr_WithLength( 269 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); 270 if (i < 0) { 271 throw css::uno::RuntimeException( 272 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) + 273 fileUrl_), 274 css::uno::Reference< css::uno::XInterface >()); 275 } 276 pos_ += i + RTL_CONSTASCII_LENGTH("?>"); 277 } 278 279 void XmlReader::skipDocumentTypeDeclaration() { 280 // Neither is it checked that the doctypedecl is at the correct position in 281 // the document, nor that it is well-formed: 282 for (;;) { 283 char c = read(); 284 switch (c) { 285 case '\0': // i.e., EOF 286 throw css::uno::RuntimeException( 287 (rtl::OUString( 288 RTL_CONSTASCII_USTRINGPARAM( 289 "premature end (within DTD) of ")) + 290 fileUrl_), 291 css::uno::Reference< css::uno::XInterface >()); 292 case '"': 293 case '\'': 294 { 295 sal_Int32 i = rtl_str_indexOfChar_WithLength( 296 pos_, end_ - pos_, c); 297 if (i < 0) { 298 throw css::uno::RuntimeException( 299 (rtl::OUString( 300 RTL_CONSTASCII_USTRINGPARAM( 301 "premature end (within DTD) of ")) + 302 fileUrl_), 303 css::uno::Reference< css::uno::XInterface >()); 304 } 305 pos_ += i + 1; 306 } 307 break; 308 case '>': 309 return; 310 case '[': 311 for (;;) { 312 c = read(); 313 switch (c) { 314 case '\0': // i.e., EOF 315 throw css::uno::RuntimeException( 316 (rtl::OUString( 317 RTL_CONSTASCII_USTRINGPARAM( 318 "premature end (within DTD) of ")) + 319 fileUrl_), 320 css::uno::Reference< css::uno::XInterface >()); 321 case '"': 322 case '\'': 323 { 324 sal_Int32 i = rtl_str_indexOfChar_WithLength( 325 pos_, end_ - pos_, c); 326 if (i < 0) { 327 throw css::uno::RuntimeException( 328 (rtl::OUString( 329 RTL_CONSTASCII_USTRINGPARAM( 330 "premature end (within DTD) of ")) + 331 fileUrl_), 332 css::uno::Reference< css::uno::XInterface >()); 333 } 334 pos_ += i + 1; 335 } 336 break; 337 case '<': 338 switch (read()) { 339 case '\0': // i.e., EOF 340 throw css::uno::RuntimeException( 341 (rtl::OUString( 342 RTL_CONSTASCII_USTRINGPARAM( 343 "premature end (within DTD) of ")) + 344 fileUrl_), 345 css::uno::Reference< css::uno::XInterface >()); 346 case '!': 347 skipComment(); 348 break; 349 case '?': 350 skipProcessingInstruction(); 351 break; 352 default: 353 break; 354 } 355 break; 356 case ']': 357 skipSpace(); 358 if (read() != '>') { 359 throw css::uno::RuntimeException( 360 (rtl::OUString( 361 RTL_CONSTASCII_USTRINGPARAM( 362 "missing \">\" of DTD in ")) + 363 fileUrl_), 364 css::uno::Reference< css::uno::XInterface >()); 365 } 366 return; 367 default: 368 break; 369 } 370 } 371 default: 372 break; 373 } 374 } 375 } 376 377 Span XmlReader::scanCdataSection() { 378 if (rtl_str_shortenedCompare_WithLength( 379 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), 380 RTL_CONSTASCII_LENGTH("[CDATA[")) != 381 0) 382 { 383 return Span(); 384 } 385 pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); 386 char const * begin = pos_; 387 sal_Int32 i = rtl_str_indexOfStr_WithLength( 388 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); 389 if (i < 0) { 390 throw css::uno::RuntimeException( 391 (rtl::OUString( 392 RTL_CONSTASCII_USTRINGPARAM( 393 "premature end (within CDATA section) of ")) + 394 fileUrl_), 395 css::uno::Reference< css::uno::XInterface >()); 396 } 397 pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); 398 return Span(begin, i); 399 } 400 401 bool XmlReader::scanName(char const ** nameColon) { 402 OSL_ASSERT(nameColon != 0 && *nameColon == 0); 403 for (char const * begin = pos_;; ++pos_) { 404 switch (peek()) { 405 case '\0': // i.e., EOF 406 case '\x09': 407 case '\x0A': 408 case '\x0D': 409 case ' ': 410 case '/': 411 case '=': 412 case '>': 413 return pos_ != begin; 414 case ':': 415 *nameColon = pos_; 416 break; 417 default: 418 break; 419 } 420 } 421 } 422 423 int XmlReader::scanNamespaceIri(char const * begin, char const * end) { 424 OSL_ASSERT(begin != 0 && begin <= end); 425 Span iri(handleAttributeValue(begin, end, false)); 426 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { 427 if (namespaceIris_[i].equals(iri)) { 428 return toNamespaceId(i); 429 } 430 } 431 return XmlReader::NAMESPACE_UNKNOWN; 432 } 433 434 char const * XmlReader::handleReference(char const * position, char const * end) 435 { 436 OSL_ASSERT(position != 0 && *position == '&' && position < end); 437 ++position; 438 if (*position == '#') { 439 ++position; 440 sal_Int32 val = 0; 441 char const * p; 442 if (*position == 'x') { 443 ++position; 444 p = position; 445 for (;; ++position) { 446 char c = *position; 447 if (c >= '0' && c <= '9') { 448 val = 16 * val + (c - '0'); 449 } else if (c >= 'A' && c <= 'F') { 450 val = 16 * val + (c - 'A') + 10; 451 } else if (c >= 'a' && c <= 'f') { 452 val = 16 * val + (c - 'a') + 10; 453 } else { 454 break; 455 } 456 if (val > 0x10FFFF) { // avoid overflow 457 throw css::uno::RuntimeException( 458 (rtl::OUString( 459 RTL_CONSTASCII_USTRINGPARAM( 460 "'&#x...' too large in ")) + 461 fileUrl_), 462 css::uno::Reference< css::uno::XInterface >()); 463 } 464 } 465 } else { 466 p = position; 467 for (;; ++position) { 468 char c = *position; 469 if (c >= '0' && c <= '9') { 470 val = 10 * val + (c - '0'); 471 } else { 472 break; 473 } 474 if (val > 0x10FFFF) { // avoid overflow 475 throw css::uno::RuntimeException( 476 (rtl::OUString( 477 RTL_CONSTASCII_USTRINGPARAM( 478 "'&#...' too large in ")) + 479 fileUrl_), 480 css::uno::Reference< css::uno::XInterface >()); 481 } 482 } 483 } 484 if (position == p || *position++ != ';') { 485 throw css::uno::RuntimeException( 486 (rtl::OUString( 487 RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) + 488 fileUrl_), 489 css::uno::Reference< css::uno::XInterface >()); 490 } 491 OSL_ASSERT(val >= 0 && val <= 0x10FFFF); 492 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || 493 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) 494 { 495 throw css::uno::RuntimeException( 496 (rtl::OUString( 497 RTL_CONSTASCII_USTRINGPARAM( 498 "character reference denoting invalid character in ")) + 499 fileUrl_), 500 css::uno::Reference< css::uno::XInterface >()); 501 } 502 char buf[4]; 503 sal_Int32 len; 504 if (val < 0x80) { 505 buf[0] = static_cast< char >(val); 506 len = 1; 507 } else if (val < 0x800) { 508 buf[0] = static_cast< char >((val >> 6) | 0xC0); 509 buf[1] = static_cast< char >((val & 0x3F) | 0x80); 510 len = 2; 511 } else if (val < 0x10000) { 512 buf[0] = static_cast< char >((val >> 12) | 0xE0); 513 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 514 buf[2] = static_cast< char >((val & 0x3F) | 0x80); 515 len = 3; 516 } else { 517 buf[0] = static_cast< char >((val >> 18) | 0xF0); 518 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); 519 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 520 buf[3] = static_cast< char >((val & 0x3F) | 0x80); 521 len = 4; 522 } 523 pad_.addEphemeral(buf, len); 524 return position; 525 } else { 526 struct EntityRef { 527 char const * inBegin; 528 sal_Int32 inLength; 529 char const * outBegin; 530 sal_Int32 outLength; 531 }; 532 static EntityRef const refs[] = { 533 { RTL_CONSTASCII_STRINGPARAM("amp;"), 534 RTL_CONSTASCII_STRINGPARAM("&") }, 535 { RTL_CONSTASCII_STRINGPARAM("lt;"), 536 RTL_CONSTASCII_STRINGPARAM("<") }, 537 { RTL_CONSTASCII_STRINGPARAM("gt;"), 538 RTL_CONSTASCII_STRINGPARAM(">") }, 539 { RTL_CONSTASCII_STRINGPARAM("apos;"), 540 RTL_CONSTASCII_STRINGPARAM("'") }, 541 { RTL_CONSTASCII_STRINGPARAM("quot;"), 542 RTL_CONSTASCII_STRINGPARAM("\"") } }; 543 for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) { 544 if (rtl_str_shortenedCompare_WithLength( 545 position, end - position, refs[i].inBegin, refs[i].inLength, 546 refs[i].inLength) == 547 0) 548 { 549 position += refs[i].inLength; 550 pad_.add(refs[i].outBegin, refs[i].outLength); 551 return position; 552 } 553 } 554 throw css::uno::RuntimeException( 555 (rtl::OUString( 556 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) + 557 fileUrl_), 558 css::uno::Reference< css::uno::XInterface >()); 559 } 560 } 561 562 Span XmlReader::handleAttributeValue( 563 char const * begin, char const * end, bool fullyNormalize) 564 { 565 pad_.clear(); 566 if (fullyNormalize) { 567 while (begin != end && isSpace(*begin)) { 568 ++begin; 569 } 570 while (end != begin && isSpace(end[-1])) { 571 --end; 572 } 573 char const * p = begin; 574 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 575 // a single true space character can go into the current span, 576 // everything else breaks the span 577 Space space = SPACE_NONE; 578 while (p != end) { 579 switch (*p) { 580 case '\x09': 581 case '\x0A': 582 case '\x0D': 583 switch (space) { 584 case SPACE_NONE: 585 pad_.add(begin, p - begin); 586 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 587 space = SPACE_BREAK; 588 break; 589 case SPACE_SPAN: 590 pad_.add(begin, p - begin); 591 space = SPACE_BREAK; 592 break; 593 case SPACE_BREAK: 594 break; 595 } 596 begin = ++p; 597 break; 598 case ' ': 599 switch (space) { 600 case SPACE_NONE: 601 ++p; 602 space = SPACE_SPAN; 603 break; 604 case SPACE_SPAN: 605 pad_.add(begin, p - begin); 606 begin = ++p; 607 space = SPACE_BREAK; 608 break; 609 case SPACE_BREAK: 610 begin = ++p; 611 break; 612 } 613 break; 614 case '&': 615 pad_.add(begin, p - begin); 616 p = handleReference(p, end); 617 begin = p; 618 space = SPACE_NONE; 619 break; 620 default: 621 ++p; 622 space = SPACE_NONE; 623 break; 624 } 625 } 626 pad_.add(begin, p - begin); 627 } else { 628 char const * p = begin; 629 while (p != end) { 630 switch (*p) { 631 case '\x09': 632 case '\x0A': 633 pad_.add(begin, p - begin); 634 begin = ++p; 635 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 636 break; 637 case '\x0D': 638 pad_.add(begin, p - begin); 639 ++p; 640 if (peek() == '\x0A') { 641 ++p; 642 } 643 begin = p; 644 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 645 break; 646 case '&': 647 pad_.add(begin, p - begin); 648 p = handleReference(p, end); 649 begin = p; 650 break; 651 default: 652 ++p; 653 break; 654 } 655 } 656 pad_.add(begin, p - begin); 657 } 658 return pad_.get(); 659 } 660 661 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { 662 OSL_ASSERT(nsId != 0 && localName); 663 char const * nameBegin = pos_; 664 char const * nameColon = 0; 665 if (!scanName(&nameColon)) { 666 throw css::uno::RuntimeException( 667 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) + 668 fileUrl_), 669 css::uno::Reference< css::uno::XInterface >()); 670 } 671 char const * nameEnd = pos_; 672 NamespaceList::size_type inheritedNamespaces = namespaces_.size(); 673 bool hasDefaultNs = false; 674 int defaultNsId = NAMESPACE_NONE; 675 attributes_.clear(); 676 for (;;) { 677 char const * p = pos_; 678 skipSpace(); 679 if (peek() == '/' || peek() == '>') { 680 break; 681 } 682 if (pos_ == p) { 683 throw css::uno::RuntimeException( 684 (rtl::OUString( 685 RTL_CONSTASCII_USTRINGPARAM( 686 "missing whitespace before attribute in ")) + 687 fileUrl_), 688 css::uno::Reference< css::uno::XInterface >()); 689 } 690 char const * attrNameBegin = pos_; 691 char const * attrNameColon = 0; 692 if (!scanName(&attrNameColon)) { 693 throw css::uno::RuntimeException( 694 (rtl::OUString( 695 RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) + 696 fileUrl_), 697 css::uno::Reference< css::uno::XInterface >()); 698 } 699 char const * attrNameEnd = pos_; 700 skipSpace(); 701 if (read() != '=') { 702 throw css::uno::RuntimeException( 703 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) + 704 fileUrl_), 705 css::uno::Reference< css::uno::XInterface >()); 706 } 707 skipSpace(); 708 char del = read(); 709 if (del != '\'' && del != '"') { 710 throw css::uno::RuntimeException( 711 (rtl::OUString( 712 RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) + 713 fileUrl_), 714 css::uno::Reference< css::uno::XInterface >()); 715 } 716 char const * valueBegin = pos_; 717 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); 718 if (i < 0) { 719 throw css::uno::RuntimeException( 720 (rtl::OUString( 721 RTL_CONSTASCII_USTRINGPARAM( 722 "unterminated attribute value in ")) + 723 fileUrl_), 724 css::uno::Reference< css::uno::XInterface >()); 725 } 726 char const * valueEnd = pos_ + i; 727 pos_ += i + 1; 728 if (attrNameColon == 0 && 729 Span(attrNameBegin, attrNameEnd - attrNameBegin).equals( 730 RTL_CONSTASCII_STRINGPARAM("xmlns"))) 731 { 732 hasDefaultNs = true; 733 defaultNsId = scanNamespaceIri(valueBegin, valueEnd); 734 } else if (attrNameColon != 0 && 735 Span(attrNameBegin, attrNameColon - attrNameBegin).equals( 736 RTL_CONSTASCII_STRINGPARAM("xmlns"))) 737 { 738 namespaces_.push_back( 739 NamespaceData( 740 Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), 741 scanNamespaceIri(valueBegin, valueEnd))); 742 } else { 743 attributes_.push_back( 744 AttributeData( 745 attrNameBegin, attrNameEnd, attrNameColon, valueBegin, 746 valueEnd)); 747 } 748 } 749 if (!hasDefaultNs && !elements_.empty()) { 750 defaultNsId = elements_.top().defaultNamespaceId; 751 } 752 firstAttribute_ = true; 753 if (peek() == '/') { 754 state_ = STATE_EMPTY_ELEMENT_TAG; 755 ++pos_; 756 } else { 757 state_ = STATE_CONTENT; 758 } 759 if (peek() != '>') { 760 throw css::uno::RuntimeException( 761 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 762 fileUrl_), 763 css::uno::Reference< css::uno::XInterface >()); 764 } 765 ++pos_; 766 elements_.push( 767 ElementData( 768 Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, 769 defaultNsId)); 770 if (nameColon == 0) { 771 *nsId = defaultNsId; 772 *localName = Span(nameBegin, nameEnd - nameBegin); 773 } else { 774 *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); 775 *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); 776 } 777 return RESULT_BEGIN; 778 } 779 780 XmlReader::Result XmlReader::handleEndTag() { 781 if (elements_.empty()) { 782 throw css::uno::RuntimeException( 783 (rtl::OUString( 784 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) + 785 fileUrl_), 786 css::uno::Reference< css::uno::XInterface >()); 787 } 788 char const * nameBegin = pos_; 789 char const * nameColon = 0; 790 if (!scanName(&nameColon) || 791 !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) 792 { 793 throw css::uno::RuntimeException( 794 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) + 795 fileUrl_), 796 css::uno::Reference< css::uno::XInterface >()); 797 } 798 handleElementEnd(); 799 skipSpace(); 800 if (peek() != '>') { 801 throw css::uno::RuntimeException( 802 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 803 fileUrl_), 804 css::uno::Reference< css::uno::XInterface >()); 805 } 806 ++pos_; 807 return RESULT_END; 808 } 809 810 void XmlReader::handleElementEnd() { 811 OSL_ASSERT(!elements_.empty()); 812 namespaces_.resize(elements_.top().inheritedNamespaces); 813 elements_.pop(); 814 state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT; 815 } 816 817 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { 818 for (;;) { 819 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<'); 820 if (i < 0) { 821 throw css::uno::RuntimeException( 822 (rtl::OUString( 823 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 824 fileUrl_), 825 css::uno::Reference< css::uno::XInterface >()); 826 } 827 pos_ += i + 1; 828 switch (peek()) { 829 case '!': 830 ++pos_; 831 if (!skipComment() && !scanCdataSection().is()) { 832 skipDocumentTypeDeclaration(); 833 } 834 break; 835 case '/': 836 ++pos_; 837 return handleEndTag(); 838 case '?': 839 ++pos_; 840 skipProcessingInstruction(); 841 break; 842 default: 843 return handleStartTag(nsId, data); 844 } 845 } 846 } 847 848 XmlReader::Result XmlReader::handleRawText(Span * text) { 849 pad_.clear(); 850 for (char const * begin = pos_;;) { 851 switch (peek()) { 852 case '\0': // i.e., EOF 853 throw css::uno::RuntimeException( 854 (rtl::OUString( 855 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 856 fileUrl_), 857 css::uno::Reference< css::uno::XInterface >()); 858 case '\x0D': 859 pad_.add(begin, pos_ - begin); 860 ++pos_; 861 if (peek() != '\x0A') { 862 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 863 } 864 begin = pos_; 865 break; 866 case '&': 867 pad_.add(begin, pos_ - begin); 868 pos_ = handleReference(pos_, end_); 869 begin = pos_; 870 break; 871 case '<': 872 pad_.add(begin, pos_ - begin); 873 ++pos_; 874 switch (peek()) { 875 case '!': 876 ++pos_; 877 if (!skipComment()) { 878 Span cdata(scanCdataSection()); 879 if (cdata.is()) { 880 normalizeLineEnds(cdata); 881 } else { 882 skipDocumentTypeDeclaration(); 883 } 884 } 885 begin = pos_; 886 break; 887 case '/': 888 *text = pad_.get(); 889 ++pos_; 890 state_ = STATE_END_TAG; 891 return RESULT_TEXT; 892 case '?': 893 ++pos_; 894 skipProcessingInstruction(); 895 begin = pos_; 896 break; 897 default: 898 *text = pad_.get(); 899 state_ = STATE_START_TAG; 900 return RESULT_TEXT; 901 } 902 break; 903 default: 904 ++pos_; 905 break; 906 } 907 } 908 } 909 910 XmlReader::Result XmlReader::handleNormalizedText(Span * text) { 911 pad_.clear(); 912 char const * flowBegin = pos_; 913 char const * flowEnd = pos_; 914 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 915 // a single true space character can go into the current flow, 916 // everything else breaks the flow 917 Space space = SPACE_START; 918 for (;;) { 919 switch (peek()) { 920 case '\0': // i.e., EOF 921 throw css::uno::RuntimeException( 922 (rtl::OUString( 923 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 924 fileUrl_), 925 css::uno::Reference< css::uno::XInterface >()); 926 case '\x09': 927 case '\x0A': 928 case '\x0D': 929 switch (space) { 930 case SPACE_START: 931 case SPACE_BREAK: 932 break; 933 case SPACE_NONE: 934 case SPACE_SPAN: 935 space = SPACE_BREAK; 936 break; 937 } 938 ++pos_; 939 break; 940 case ' ': 941 switch (space) { 942 case SPACE_START: 943 case SPACE_BREAK: 944 break; 945 case SPACE_NONE: 946 space = SPACE_SPAN; 947 break; 948 case SPACE_SPAN: 949 space = SPACE_BREAK; 950 break; 951 } 952 ++pos_; 953 break; 954 case '&': 955 switch (space) { 956 case SPACE_START: 957 break; 958 case SPACE_NONE: 959 case SPACE_SPAN: 960 pad_.add(flowBegin, pos_ - flowBegin); 961 break; 962 case SPACE_BREAK: 963 pad_.add(flowBegin, flowEnd - flowBegin); 964 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 965 break; 966 } 967 pos_ = handleReference(pos_, end_); 968 flowBegin = pos_; 969 flowEnd = pos_; 970 space = SPACE_NONE; 971 break; 972 case '<': 973 ++pos_; 974 switch (peek()) { 975 case '!': 976 ++pos_; 977 if (skipComment()) { 978 space = SPACE_BREAK; 979 } else { 980 Span cdata(scanCdataSection()); 981 if (cdata.is()) { 982 // CDATA is not normalized (similar to character 983 // references; it keeps the code simple), but it might 984 // arguably be better to normalize it: 985 switch (space) { 986 case SPACE_START: 987 break; 988 case SPACE_NONE: 989 case SPACE_SPAN: 990 pad_.add(flowBegin, pos_ - flowBegin); 991 break; 992 case SPACE_BREAK: 993 pad_.add(flowBegin, flowEnd - flowBegin); 994 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 995 break; 996 } 997 normalizeLineEnds(cdata); 998 flowBegin = pos_; 999 flowEnd = pos_; 1000 space = SPACE_NONE; 1001 } else { 1002 skipDocumentTypeDeclaration(); 1003 } 1004 } 1005 break; 1006 case '/': 1007 ++pos_; 1008 pad_.add(flowBegin, flowEnd - flowBegin); 1009 *text = pad_.get(); 1010 state_ = STATE_END_TAG; 1011 return RESULT_TEXT; 1012 case '?': 1013 ++pos_; 1014 skipProcessingInstruction(); 1015 space = SPACE_BREAK; 1016 break; 1017 default: 1018 pad_.add(flowBegin, flowEnd - flowBegin); 1019 *text = pad_.get(); 1020 state_ = STATE_START_TAG; 1021 return RESULT_TEXT; 1022 } 1023 break; 1024 default: 1025 switch (space) { 1026 case SPACE_START: 1027 flowBegin = pos_; 1028 break; 1029 case SPACE_NONE: 1030 case SPACE_SPAN: 1031 break; 1032 case SPACE_BREAK: 1033 pad_.add(flowBegin, flowEnd - flowBegin); 1034 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 1035 flowBegin = pos_; 1036 break; 1037 } 1038 flowEnd = ++pos_; 1039 space = SPACE_NONE; 1040 break; 1041 } 1042 } 1043 } 1044 1045 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { 1046 OSL_ASSERT(pos <= INT_MAX); 1047 return static_cast< int >(pos); 1048 } 1049 1050 } 1051