1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 #include "precompiled_xmlreader.hxx" 29 #include "sal/config.h" 30 31 #include <climits> 32 #include <cstddef> 33 34 #include "com/sun/star/container/NoSuchElementException.hpp" 35 #include "com/sun/star/uno/Reference.hxx" 36 #include "com/sun/star/uno/RuntimeException.hpp" 37 #include "com/sun/star/uno/XInterface.hpp" 38 #include "osl/diagnose.h" 39 #include "osl/file.h" 40 #include "rtl/string.h" 41 #include "rtl/ustring.h" 42 #include "rtl/ustring.hxx" 43 #include "sal/types.h" 44 #include "xmlreader/pad.hxx" 45 #include "xmlreader/span.hxx" 46 #include "xmlreader/xmlreader.hxx" 47 48 namespace xmlreader { 49 50 namespace { 51 52 namespace css = com::sun::star; 53 54 bool isSpace(char c) { 55 switch (c) { 56 case '\x09': 57 case '\x0A': 58 case '\x0D': 59 case ' ': 60 return true; 61 default: 62 return false; 63 } 64 } 65 66 } 67 68 XmlReader::XmlReader(rtl::OUString const & fileUrl) 69 SAL_THROW(( 70 css::container::NoSuchElementException, css::uno::RuntimeException)): 71 fileUrl_(fileUrl) 72 { 73 switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read)) 74 { 75 case osl_File_E_None: 76 break; 77 case osl_File_E_NOENT: 78 throw css::container::NoSuchElementException( 79 fileUrl_, css::uno::Reference< css::uno::XInterface >()); 80 default: 81 throw css::uno::RuntimeException( 82 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) + 83 fileUrl_), 84 css::uno::Reference< css::uno::XInterface >()); 85 } 86 oslFileError e = osl_getFileSize(fileHandle_, &fileSize_); 87 if (e == osl_File_E_None) { 88 e = osl_mapFile( 89 fileHandle_, &fileAddress_, fileSize_, 0, 90 osl_File_MapFlag_WillNeed); 91 } 92 if (e != osl_File_E_None) { 93 e = osl_closeFile(fileHandle_); 94 if (e != osl_File_E_None) { 95 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 96 } 97 throw css::uno::RuntimeException( 98 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) + 99 fileUrl_), 100 css::uno::Reference< css::uno::XInterface >()); 101 } 102 namespaceIris_.push_back( 103 Span( 104 RTL_CONSTASCII_STRINGPARAM( 105 "http://www.w3.org/XML/1998/namespace"))); 106 namespaces_.push_back( 107 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML)); 108 pos_ = static_cast< char * >(fileAddress_); 109 end_ = pos_ + fileSize_; 110 state_ = STATE_CONTENT; 111 } 112 113 XmlReader::~XmlReader() { 114 oslFileError e = osl_unmapFile(fileAddress_, fileSize_); 115 if (e != osl_File_E_None) { 116 OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e)); 117 } 118 e = osl_closeFile(fileHandle_); 119 if (e != osl_File_E_None) { 120 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); 121 } 122 } 123 124 int XmlReader::registerNamespaceIri(Span const & iri) { 125 int id = toNamespaceId(namespaceIris_.size()); 126 namespaceIris_.push_back(iri); 127 if (iri.equals( 128 Span( 129 RTL_CONSTASCII_STRINGPARAM( 130 "http://www.w3.org/2001/XMLSchema-instance")))) 131 { 132 // Old user layer .xcu files used the xsi namespace prefix without 133 // declaring a corresponding namespace binding, see issue 77174; reading 134 // those files during migration would fail without this hack that can be 135 // removed once migration is no longer relevant (see 136 // configmgr::Components::parseModificationLayer): 137 namespaces_.push_back( 138 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id)); 139 } 140 return id; 141 } 142 143 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) 144 { 145 switch (state_) { 146 case STATE_CONTENT: 147 switch (reportText) { 148 case TEXT_NONE: 149 return handleSkippedText(data, nsId); 150 case TEXT_RAW: 151 return handleRawText(data); 152 case TEXT_NORMALIZED: 153 return handleNormalizedText(data); 154 } 155 case STATE_START_TAG: 156 return handleStartTag(nsId, data); 157 case STATE_END_TAG: 158 return handleEndTag(); 159 case STATE_EMPTY_ELEMENT_TAG: 160 handleElementEnd(); 161 return RESULT_END; 162 default: // STATE_DONE 163 return RESULT_DONE; 164 } 165 } 166 167 bool XmlReader::nextAttribute(int * nsId, Span * localName) { 168 OSL_ASSERT(nsId != 0 && localName != 0); 169 if (firstAttribute_) { 170 currentAttribute_ = attributes_.begin(); 171 firstAttribute_ = false; 172 } else { 173 ++currentAttribute_; 174 } 175 if (currentAttribute_ == attributes_.end()) { 176 return false; 177 } 178 if (currentAttribute_->nameColon == 0) { 179 *nsId = NAMESPACE_NONE; 180 *localName = Span( 181 currentAttribute_->nameBegin, 182 currentAttribute_->nameEnd - currentAttribute_->nameBegin); 183 } else { 184 *nsId = getNamespaceId( 185 Span( 186 currentAttribute_->nameBegin, 187 currentAttribute_->nameColon - currentAttribute_->nameBegin)); 188 *localName = Span( 189 currentAttribute_->nameColon + 1, 190 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); 191 } 192 return true; 193 } 194 195 Span XmlReader::getAttributeValue(bool fullyNormalize) { 196 return handleAttributeValue( 197 currentAttribute_->valueBegin, currentAttribute_->valueEnd, 198 fullyNormalize); 199 } 200 201 int XmlReader::getNamespaceId(Span const & prefix) const { 202 for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin()); 203 i != namespaces_.rend(); ++i) 204 { 205 if (prefix.equals(i->prefix)) { 206 return i->nsId; 207 } 208 } 209 return NAMESPACE_UNKNOWN; 210 } 211 212 rtl::OUString XmlReader::getUrl() const { 213 return fileUrl_; 214 } 215 216 void XmlReader::normalizeLineEnds(Span const & text) { 217 char const * p = text.begin; 218 sal_Int32 n = text.length; 219 for (;;) { 220 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); 221 if (i < 0) { 222 break; 223 } 224 pad_.add(p, i); 225 p += i + 1; 226 n -= i + 1; 227 if (n == 0 || *p != '\x0A') { 228 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 229 } 230 } 231 pad_.add(p, n); 232 } 233 234 void XmlReader::skipSpace() { 235 while (isSpace(peek())) { 236 ++pos_; 237 } 238 } 239 240 bool XmlReader::skipComment() { 241 if (rtl_str_shortenedCompare_WithLength( 242 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), 243 RTL_CONSTASCII_LENGTH("--")) != 244 0) 245 { 246 return false; 247 } 248 pos_ += RTL_CONSTASCII_LENGTH("--"); 249 sal_Int32 i = rtl_str_indexOfStr_WithLength( 250 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); 251 if (i < 0) { 252 throw css::uno::RuntimeException( 253 (rtl::OUString( 254 RTL_CONSTASCII_USTRINGPARAM( 255 "premature end (within comment) of ")) + 256 fileUrl_), 257 css::uno::Reference< css::uno::XInterface >()); 258 } 259 pos_ += i + RTL_CONSTASCII_LENGTH("--"); 260 if (read() != '>') { 261 throw css::uno::RuntimeException( 262 (rtl::OUString( 263 RTL_CONSTASCII_USTRINGPARAM( 264 "illegal \"--\" within comment in ")) + 265 fileUrl_), 266 css::uno::Reference< css::uno::XInterface >()); 267 } 268 return true; 269 } 270 271 void XmlReader::skipProcessingInstruction() { 272 sal_Int32 i = rtl_str_indexOfStr_WithLength( 273 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); 274 if (i < 0) { 275 throw css::uno::RuntimeException( 276 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) + 277 fileUrl_), 278 css::uno::Reference< css::uno::XInterface >()); 279 } 280 pos_ += i + RTL_CONSTASCII_LENGTH("?>"); 281 } 282 283 void XmlReader::skipDocumentTypeDeclaration() { 284 // Neither is it checked that the doctypedecl is at the correct position in 285 // the document, nor that it is well-formed: 286 for (;;) { 287 char c = read(); 288 switch (c) { 289 case '\0': // i.e., EOF 290 throw css::uno::RuntimeException( 291 (rtl::OUString( 292 RTL_CONSTASCII_USTRINGPARAM( 293 "premature end (within DTD) of ")) + 294 fileUrl_), 295 css::uno::Reference< css::uno::XInterface >()); 296 case '"': 297 case '\'': 298 { 299 sal_Int32 i = rtl_str_indexOfChar_WithLength( 300 pos_, end_ - pos_, c); 301 if (i < 0) { 302 throw css::uno::RuntimeException( 303 (rtl::OUString( 304 RTL_CONSTASCII_USTRINGPARAM( 305 "premature end (within DTD) of ")) + 306 fileUrl_), 307 css::uno::Reference< css::uno::XInterface >()); 308 } 309 pos_ += i + 1; 310 } 311 break; 312 case '>': 313 return; 314 case '[': 315 for (;;) { 316 c = read(); 317 switch (c) { 318 case '\0': // i.e., EOF 319 throw css::uno::RuntimeException( 320 (rtl::OUString( 321 RTL_CONSTASCII_USTRINGPARAM( 322 "premature end (within DTD) of ")) + 323 fileUrl_), 324 css::uno::Reference< css::uno::XInterface >()); 325 case '"': 326 case '\'': 327 { 328 sal_Int32 i = rtl_str_indexOfChar_WithLength( 329 pos_, end_ - pos_, c); 330 if (i < 0) { 331 throw css::uno::RuntimeException( 332 (rtl::OUString( 333 RTL_CONSTASCII_USTRINGPARAM( 334 "premature end (within DTD) of ")) + 335 fileUrl_), 336 css::uno::Reference< css::uno::XInterface >()); 337 } 338 pos_ += i + 1; 339 } 340 break; 341 case '<': 342 switch (read()) { 343 case '\0': // i.e., EOF 344 throw css::uno::RuntimeException( 345 (rtl::OUString( 346 RTL_CONSTASCII_USTRINGPARAM( 347 "premature end (within DTD) of ")) + 348 fileUrl_), 349 css::uno::Reference< css::uno::XInterface >()); 350 case '!': 351 skipComment(); 352 break; 353 case '?': 354 skipProcessingInstruction(); 355 break; 356 default: 357 break; 358 } 359 break; 360 case ']': 361 skipSpace(); 362 if (read() != '>') { 363 throw css::uno::RuntimeException( 364 (rtl::OUString( 365 RTL_CONSTASCII_USTRINGPARAM( 366 "missing \">\" of DTD in ")) + 367 fileUrl_), 368 css::uno::Reference< css::uno::XInterface >()); 369 } 370 return; 371 default: 372 break; 373 } 374 } 375 default: 376 break; 377 } 378 } 379 } 380 381 Span XmlReader::scanCdataSection() { 382 if (rtl_str_shortenedCompare_WithLength( 383 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), 384 RTL_CONSTASCII_LENGTH("[CDATA[")) != 385 0) 386 { 387 return Span(); 388 } 389 pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); 390 char const * begin = pos_; 391 sal_Int32 i = rtl_str_indexOfStr_WithLength( 392 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); 393 if (i < 0) { 394 throw css::uno::RuntimeException( 395 (rtl::OUString( 396 RTL_CONSTASCII_USTRINGPARAM( 397 "premature end (within CDATA section) of ")) + 398 fileUrl_), 399 css::uno::Reference< css::uno::XInterface >()); 400 } 401 pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); 402 return Span(begin, i); 403 } 404 405 bool XmlReader::scanName(char const ** nameColon) { 406 OSL_ASSERT(nameColon != 0 && *nameColon == 0); 407 for (char const * begin = pos_;; ++pos_) { 408 switch (peek()) { 409 case '\0': // i.e., EOF 410 case '\x09': 411 case '\x0A': 412 case '\x0D': 413 case ' ': 414 case '/': 415 case '=': 416 case '>': 417 return pos_ != begin; 418 case ':': 419 *nameColon = pos_; 420 break; 421 default: 422 break; 423 } 424 } 425 } 426 427 int XmlReader::scanNamespaceIri(char const * begin, char const * end) { 428 OSL_ASSERT(begin != 0 && begin <= end); 429 Span iri(handleAttributeValue(begin, end, false)); 430 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { 431 if (namespaceIris_[i].equals(iri)) { 432 return toNamespaceId(i); 433 } 434 } 435 return XmlReader::NAMESPACE_UNKNOWN; 436 } 437 438 char const * XmlReader::handleReference(char const * position, char const * end) 439 { 440 OSL_ASSERT(position != 0 && *position == '&' && position < end); 441 ++position; 442 if (*position == '#') { 443 ++position; 444 sal_Int32 val = 0; 445 char const * p; 446 if (*position == 'x') { 447 ++position; 448 p = position; 449 for (;; ++position) { 450 char c = *position; 451 if (c >= '0' && c <= '9') { 452 val = 16 * val + (c - '0'); 453 } else if (c >= 'A' && c <= 'F') { 454 val = 16 * val + (c - 'A') + 10; 455 } else if (c >= 'a' && c <= 'f') { 456 val = 16 * val + (c - 'a') + 10; 457 } else { 458 break; 459 } 460 if (val > 0x10FFFF) { // avoid overflow 461 throw css::uno::RuntimeException( 462 (rtl::OUString( 463 RTL_CONSTASCII_USTRINGPARAM( 464 "'&#x...' too large in ")) + 465 fileUrl_), 466 css::uno::Reference< css::uno::XInterface >()); 467 } 468 } 469 } else { 470 p = position; 471 for (;; ++position) { 472 char c = *position; 473 if (c >= '0' && c <= '9') { 474 val = 10 * val + (c - '0'); 475 } else { 476 break; 477 } 478 if (val > 0x10FFFF) { // avoid overflow 479 throw css::uno::RuntimeException( 480 (rtl::OUString( 481 RTL_CONSTASCII_USTRINGPARAM( 482 "'&#...' too large in ")) + 483 fileUrl_), 484 css::uno::Reference< css::uno::XInterface >()); 485 } 486 } 487 } 488 if (position == p || *position++ != ';') { 489 throw css::uno::RuntimeException( 490 (rtl::OUString( 491 RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) + 492 fileUrl_), 493 css::uno::Reference< css::uno::XInterface >()); 494 } 495 OSL_ASSERT(val >= 0 && val <= 0x10FFFF); 496 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || 497 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) 498 { 499 throw css::uno::RuntimeException( 500 (rtl::OUString( 501 RTL_CONSTASCII_USTRINGPARAM( 502 "character reference denoting invalid character in ")) + 503 fileUrl_), 504 css::uno::Reference< css::uno::XInterface >()); 505 } 506 char buf[4]; 507 sal_Int32 len; 508 if (val < 0x80) { 509 buf[0] = static_cast< char >(val); 510 len = 1; 511 } else if (val < 0x800) { 512 buf[0] = static_cast< char >((val >> 6) | 0xC0); 513 buf[1] = static_cast< char >((val & 0x3F) | 0x80); 514 len = 2; 515 } else if (val < 0x10000) { 516 buf[0] = static_cast< char >((val >> 12) | 0xE0); 517 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 518 buf[2] = static_cast< char >((val & 0x3F) | 0x80); 519 len = 3; 520 } else { 521 buf[0] = static_cast< char >((val >> 18) | 0xF0); 522 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); 523 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); 524 buf[3] = static_cast< char >((val & 0x3F) | 0x80); 525 len = 4; 526 } 527 pad_.addEphemeral(buf, len); 528 return position; 529 } else { 530 struct EntityRef { 531 char const * inBegin; 532 sal_Int32 inLength; 533 char const * outBegin; 534 sal_Int32 outLength; 535 }; 536 static EntityRef const refs[] = { 537 { RTL_CONSTASCII_STRINGPARAM("amp;"), 538 RTL_CONSTASCII_STRINGPARAM("&") }, 539 { RTL_CONSTASCII_STRINGPARAM("lt;"), 540 RTL_CONSTASCII_STRINGPARAM("<") }, 541 { RTL_CONSTASCII_STRINGPARAM("gt;"), 542 RTL_CONSTASCII_STRINGPARAM(">") }, 543 { RTL_CONSTASCII_STRINGPARAM("apos;"), 544 RTL_CONSTASCII_STRINGPARAM("'") }, 545 { RTL_CONSTASCII_STRINGPARAM("quot;"), 546 RTL_CONSTASCII_STRINGPARAM("\"") } }; 547 for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) { 548 if (rtl_str_shortenedCompare_WithLength( 549 position, end - position, refs[i].inBegin, refs[i].inLength, 550 refs[i].inLength) == 551 0) 552 { 553 position += refs[i].inLength; 554 pad_.add(refs[i].outBegin, refs[i].outLength); 555 return position; 556 } 557 } 558 throw css::uno::RuntimeException( 559 (rtl::OUString( 560 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) + 561 fileUrl_), 562 css::uno::Reference< css::uno::XInterface >()); 563 } 564 } 565 566 Span XmlReader::handleAttributeValue( 567 char const * begin, char const * end, bool fullyNormalize) 568 { 569 pad_.clear(); 570 if (fullyNormalize) { 571 while (begin != end && isSpace(*begin)) { 572 ++begin; 573 } 574 while (end != begin && isSpace(end[-1])) { 575 --end; 576 } 577 char const * p = begin; 578 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 579 // a single true space character can go into the current span, 580 // everything else breaks the span 581 Space space = SPACE_NONE; 582 while (p != end) { 583 switch (*p) { 584 case '\x09': 585 case '\x0A': 586 case '\x0D': 587 switch (space) { 588 case SPACE_NONE: 589 pad_.add(begin, p - begin); 590 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 591 space = SPACE_BREAK; 592 break; 593 case SPACE_SPAN: 594 pad_.add(begin, p - begin); 595 space = SPACE_BREAK; 596 break; 597 case SPACE_BREAK: 598 break; 599 } 600 begin = ++p; 601 break; 602 case ' ': 603 switch (space) { 604 case SPACE_NONE: 605 ++p; 606 space = SPACE_SPAN; 607 break; 608 case SPACE_SPAN: 609 pad_.add(begin, p - begin); 610 begin = ++p; 611 space = SPACE_BREAK; 612 break; 613 case SPACE_BREAK: 614 begin = ++p; 615 break; 616 } 617 break; 618 case '&': 619 pad_.add(begin, p - begin); 620 p = handleReference(p, end); 621 begin = p; 622 space = SPACE_NONE; 623 break; 624 default: 625 ++p; 626 space = SPACE_NONE; 627 break; 628 } 629 } 630 pad_.add(begin, p - begin); 631 } else { 632 char const * p = begin; 633 while (p != end) { 634 switch (*p) { 635 case '\x09': 636 case '\x0A': 637 pad_.add(begin, p - begin); 638 begin = ++p; 639 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 640 break; 641 case '\x0D': 642 pad_.add(begin, p - begin); 643 ++p; 644 if (peek() == '\x0A') { 645 ++p; 646 } 647 begin = p; 648 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 649 break; 650 case '&': 651 pad_.add(begin, p - begin); 652 p = handleReference(p, end); 653 begin = p; 654 break; 655 default: 656 ++p; 657 break; 658 } 659 } 660 pad_.add(begin, p - begin); 661 } 662 return pad_.get(); 663 } 664 665 XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { 666 OSL_ASSERT(nsId != 0 && localName); 667 char const * nameBegin = pos_; 668 char const * nameColon = 0; 669 if (!scanName(&nameColon)) { 670 throw css::uno::RuntimeException( 671 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) + 672 fileUrl_), 673 css::uno::Reference< css::uno::XInterface >()); 674 } 675 char const * nameEnd = pos_; 676 NamespaceList::size_type inheritedNamespaces = namespaces_.size(); 677 bool hasDefaultNs = false; 678 int defaultNsId = NAMESPACE_NONE; 679 attributes_.clear(); 680 for (;;) { 681 char const * p = pos_; 682 skipSpace(); 683 if (peek() == '/' || peek() == '>') { 684 break; 685 } 686 if (pos_ == p) { 687 throw css::uno::RuntimeException( 688 (rtl::OUString( 689 RTL_CONSTASCII_USTRINGPARAM( 690 "missing whitespace before attribute in ")) + 691 fileUrl_), 692 css::uno::Reference< css::uno::XInterface >()); 693 } 694 char const * attrNameBegin = pos_; 695 char const * attrNameColon = 0; 696 if (!scanName(&attrNameColon)) { 697 throw css::uno::RuntimeException( 698 (rtl::OUString( 699 RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) + 700 fileUrl_), 701 css::uno::Reference< css::uno::XInterface >()); 702 } 703 char const * attrNameEnd = pos_; 704 skipSpace(); 705 if (read() != '=') { 706 throw css::uno::RuntimeException( 707 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) + 708 fileUrl_), 709 css::uno::Reference< css::uno::XInterface >()); 710 } 711 skipSpace(); 712 char del = read(); 713 if (del != '\'' && del != '"') { 714 throw css::uno::RuntimeException( 715 (rtl::OUString( 716 RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) + 717 fileUrl_), 718 css::uno::Reference< css::uno::XInterface >()); 719 } 720 char const * valueBegin = pos_; 721 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); 722 if (i < 0) { 723 throw css::uno::RuntimeException( 724 (rtl::OUString( 725 RTL_CONSTASCII_USTRINGPARAM( 726 "unterminated attribute value in ")) + 727 fileUrl_), 728 css::uno::Reference< css::uno::XInterface >()); 729 } 730 char const * valueEnd = pos_ + i; 731 pos_ += i + 1; 732 if (attrNameColon == 0 && 733 Span(attrNameBegin, attrNameEnd - attrNameBegin).equals( 734 RTL_CONSTASCII_STRINGPARAM("xmlns"))) 735 { 736 hasDefaultNs = true; 737 defaultNsId = scanNamespaceIri(valueBegin, valueEnd); 738 } else if (attrNameColon != 0 && 739 Span(attrNameBegin, attrNameColon - attrNameBegin).equals( 740 RTL_CONSTASCII_STRINGPARAM("xmlns"))) 741 { 742 namespaces_.push_back( 743 NamespaceData( 744 Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), 745 scanNamespaceIri(valueBegin, valueEnd))); 746 } else { 747 attributes_.push_back( 748 AttributeData( 749 attrNameBegin, attrNameEnd, attrNameColon, valueBegin, 750 valueEnd)); 751 } 752 } 753 if (!hasDefaultNs && !elements_.empty()) { 754 defaultNsId = elements_.top().defaultNamespaceId; 755 } 756 firstAttribute_ = true; 757 if (peek() == '/') { 758 state_ = STATE_EMPTY_ELEMENT_TAG; 759 ++pos_; 760 } else { 761 state_ = STATE_CONTENT; 762 } 763 if (peek() != '>') { 764 throw css::uno::RuntimeException( 765 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 766 fileUrl_), 767 css::uno::Reference< css::uno::XInterface >()); 768 } 769 ++pos_; 770 elements_.push( 771 ElementData( 772 Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, 773 defaultNsId)); 774 if (nameColon == 0) { 775 *nsId = defaultNsId; 776 *localName = Span(nameBegin, nameEnd - nameBegin); 777 } else { 778 *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); 779 *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); 780 } 781 return RESULT_BEGIN; 782 } 783 784 XmlReader::Result XmlReader::handleEndTag() { 785 if (elements_.empty()) { 786 throw css::uno::RuntimeException( 787 (rtl::OUString( 788 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) + 789 fileUrl_), 790 css::uno::Reference< css::uno::XInterface >()); 791 } 792 char const * nameBegin = pos_; 793 char const * nameColon = 0; 794 if (!scanName(&nameColon) || 795 !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) 796 { 797 throw css::uno::RuntimeException( 798 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) + 799 fileUrl_), 800 css::uno::Reference< css::uno::XInterface >()); 801 } 802 handleElementEnd(); 803 skipSpace(); 804 if (peek() != '>') { 805 throw css::uno::RuntimeException( 806 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + 807 fileUrl_), 808 css::uno::Reference< css::uno::XInterface >()); 809 } 810 ++pos_; 811 return RESULT_END; 812 } 813 814 void XmlReader::handleElementEnd() { 815 OSL_ASSERT(!elements_.empty()); 816 namespaces_.resize(elements_.top().inheritedNamespaces); 817 elements_.pop(); 818 state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT; 819 } 820 821 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { 822 for (;;) { 823 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<'); 824 if (i < 0) { 825 throw css::uno::RuntimeException( 826 (rtl::OUString( 827 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 828 fileUrl_), 829 css::uno::Reference< css::uno::XInterface >()); 830 } 831 pos_ += i + 1; 832 switch (peek()) { 833 case '!': 834 ++pos_; 835 if (!skipComment() && !scanCdataSection().is()) { 836 skipDocumentTypeDeclaration(); 837 } 838 break; 839 case '/': 840 ++pos_; 841 return handleEndTag(); 842 case '?': 843 ++pos_; 844 skipProcessingInstruction(); 845 break; 846 default: 847 return handleStartTag(nsId, data); 848 } 849 } 850 } 851 852 XmlReader::Result XmlReader::handleRawText(Span * text) { 853 pad_.clear(); 854 for (char const * begin = pos_;;) { 855 switch (peek()) { 856 case '\0': // i.e., EOF 857 throw css::uno::RuntimeException( 858 (rtl::OUString( 859 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 860 fileUrl_), 861 css::uno::Reference< css::uno::XInterface >()); 862 case '\x0D': 863 pad_.add(begin, pos_ - begin); 864 ++pos_; 865 if (peek() != '\x0A') { 866 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); 867 } 868 begin = pos_; 869 break; 870 case '&': 871 pad_.add(begin, pos_ - begin); 872 pos_ = handleReference(pos_, end_); 873 begin = pos_; 874 break; 875 case '<': 876 pad_.add(begin, pos_ - begin); 877 ++pos_; 878 switch (peek()) { 879 case '!': 880 ++pos_; 881 if (!skipComment()) { 882 Span cdata(scanCdataSection()); 883 if (cdata.is()) { 884 normalizeLineEnds(cdata); 885 } else { 886 skipDocumentTypeDeclaration(); 887 } 888 } 889 begin = pos_; 890 break; 891 case '/': 892 *text = pad_.get(); 893 ++pos_; 894 state_ = STATE_END_TAG; 895 return RESULT_TEXT; 896 case '?': 897 ++pos_; 898 skipProcessingInstruction(); 899 begin = pos_; 900 break; 901 default: 902 *text = pad_.get(); 903 state_ = STATE_START_TAG; 904 return RESULT_TEXT; 905 } 906 break; 907 default: 908 ++pos_; 909 break; 910 } 911 } 912 } 913 914 XmlReader::Result XmlReader::handleNormalizedText(Span * text) { 915 pad_.clear(); 916 char const * flowBegin = pos_; 917 char const * flowEnd = pos_; 918 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; 919 // a single true space character can go into the current flow, 920 // everything else breaks the flow 921 Space space = SPACE_START; 922 for (;;) { 923 switch (peek()) { 924 case '\0': // i.e., EOF 925 throw css::uno::RuntimeException( 926 (rtl::OUString( 927 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + 928 fileUrl_), 929 css::uno::Reference< css::uno::XInterface >()); 930 case '\x09': 931 case '\x0A': 932 case '\x0D': 933 switch (space) { 934 case SPACE_START: 935 case SPACE_BREAK: 936 break; 937 case SPACE_NONE: 938 case SPACE_SPAN: 939 space = SPACE_BREAK; 940 break; 941 } 942 ++pos_; 943 break; 944 case ' ': 945 switch (space) { 946 case SPACE_START: 947 case SPACE_BREAK: 948 break; 949 case SPACE_NONE: 950 space = SPACE_SPAN; 951 break; 952 case SPACE_SPAN: 953 space = SPACE_BREAK; 954 break; 955 } 956 ++pos_; 957 break; 958 case '&': 959 switch (space) { 960 case SPACE_START: 961 break; 962 case SPACE_NONE: 963 case SPACE_SPAN: 964 pad_.add(flowBegin, pos_ - flowBegin); 965 break; 966 case SPACE_BREAK: 967 pad_.add(flowBegin, flowEnd - flowBegin); 968 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 969 break; 970 } 971 pos_ = handleReference(pos_, end_); 972 flowBegin = pos_; 973 flowEnd = pos_; 974 space = SPACE_NONE; 975 break; 976 case '<': 977 ++pos_; 978 switch (peek()) { 979 case '!': 980 ++pos_; 981 if (skipComment()) { 982 space = SPACE_BREAK; 983 } else { 984 Span cdata(scanCdataSection()); 985 if (cdata.is()) { 986 // CDATA is not normalized (similar to character 987 // references; it keeps the code simple), but it might 988 // arguably be better to normalize it: 989 switch (space) { 990 case SPACE_START: 991 break; 992 case SPACE_NONE: 993 case SPACE_SPAN: 994 pad_.add(flowBegin, pos_ - flowBegin); 995 break; 996 case SPACE_BREAK: 997 pad_.add(flowBegin, flowEnd - flowBegin); 998 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 999 break; 1000 } 1001 normalizeLineEnds(cdata); 1002 flowBegin = pos_; 1003 flowEnd = pos_; 1004 space = SPACE_NONE; 1005 } else { 1006 skipDocumentTypeDeclaration(); 1007 } 1008 } 1009 break; 1010 case '/': 1011 ++pos_; 1012 pad_.add(flowBegin, flowEnd - flowBegin); 1013 *text = pad_.get(); 1014 state_ = STATE_END_TAG; 1015 return RESULT_TEXT; 1016 case '?': 1017 ++pos_; 1018 skipProcessingInstruction(); 1019 space = SPACE_BREAK; 1020 break; 1021 default: 1022 pad_.add(flowBegin, flowEnd - flowBegin); 1023 *text = pad_.get(); 1024 state_ = STATE_START_TAG; 1025 return RESULT_TEXT; 1026 } 1027 break; 1028 default: 1029 switch (space) { 1030 case SPACE_START: 1031 flowBegin = pos_; 1032 break; 1033 case SPACE_NONE: 1034 case SPACE_SPAN: 1035 break; 1036 case SPACE_BREAK: 1037 pad_.add(flowBegin, flowEnd - flowBegin); 1038 pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); 1039 flowBegin = pos_; 1040 break; 1041 } 1042 flowEnd = ++pos_; 1043 space = SPACE_NONE; 1044 break; 1045 } 1046 } 1047 } 1048 1049 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { 1050 OSL_ASSERT(pos <= INT_MAX); 1051 return static_cast< int >(pos); 1052 } 1053 1054 } 1055