1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 #include <string.h> 24 25 #include <sal/types.h> 26 27 #include <rtl/textenc.h> 28 #include <rtl/tencinfo.h> 29 30 31 #include <com/sun/star/io/XInputStream.hpp> 32 33 using namespace rtl; 34 using namespace ::com::sun::star::uno; 35 using namespace ::com::sun::star::io; 36 37 #include "xml2utf.hxx" 38 39 namespace sax_expatwrap { 40 41 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead ) 42 throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException ) 43 { 44 45 Sequence<sal_Int8> seqIn; 46 47 if( ! m_in.is() ) { 48 throw NotConnectedException(); 49 } 50 if( ! m_bStarted ) { 51 nMaxToRead = Max( 512 , nMaxToRead ); // it should be possible to find the encoding attribute 52 // within the first 512 bytes == 128 chars in UCS-4 53 } 54 55 sal_Int32 nRead; 56 Sequence< sal_Int8 > seqStart; 57 while( sal_True ) 58 { 59 nRead = m_in->readSomeBytes( seq , nMaxToRead ); 60 61 if( nRead + seqStart.getLength()) 62 { 63 // if nRead is 0, the file is already eof. 64 if( ! m_bStarted && nRead ) 65 { 66 // ensure that enough data is available to parse encoding 67 if( seqStart.getLength() ) 68 { 69 // prefix with what we had so far. 70 sal_Int32 nLength = seq.getLength(); 71 seq.realloc( seqStart.getLength() + nLength ); 72 73 memmove (seq.getArray() + seqStart.getLength(), 74 seq.getConstArray(), 75 nLength); 76 memcpy (seq.getArray(), 77 seqStart.getConstArray(), 78 seqStart.getLength()); 79 } 80 81 // autodetection with the first bytes 82 if( ! isEncodingRecognizable( seq ) ) 83 { 84 // remember what we have so far. 85 seqStart = seq; 86 87 // read more ! 88 continue; 89 } 90 if( scanForEncoding( seq ) || m_sEncoding.getLength() ) { 91 // initialize decoding 92 initializeDecoding(); 93 } 94 nRead = seq.getLength(); 95 seqStart = Sequence < sal_Int8 > (); 96 } 97 98 // do the encoding 99 if( m_pText2Unicode && m_pUnicode2Text && 100 m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) { 101 102 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq ); 103 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() ); 104 } 105 106 if( ! m_bStarted ) 107 { 108 // it must now be ensured, that no encoding attribute exist anymore 109 // ( otherwise the expat-Parser will crash ) 110 // This must be done after decoding ! 111 // ( e.g. Files decoded in ucs-4 cannot be read properly ) 112 m_bStarted = sal_True; 113 removeEncoding( seq ); 114 } 115 nRead = seq.getLength(); 116 } 117 118 break; 119 } 120 return nRead; 121 } 122 123 124 XMLFile2UTFConverter::~XMLFile2UTFConverter() 125 { 126 if( m_pText2Unicode ) 127 delete m_pText2Unicode; 128 if( m_pUnicode2Text ) 129 delete m_pUnicode2Text; 130 } 131 132 133 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq ) 134 { 135 const sal_Int8 *pSource = seq.getArray(); 136 if( ! strncmp( (const char * ) pSource , "<?xml" , 4) ) 137 { 138 139 // scan for encoding 140 OString str( (sal_Char * ) pSource , seq.getLength() ); 141 142 // cut sequence to first line break 143 // find first line break; 144 int nMax = str.indexOf( 10 ); 145 if( nMax >= 0 ) 146 { 147 str = str.copy( 0 , nMax ); 148 } 149 150 int nFound = str.indexOf( " encoding" ); 151 if( nFound >= 0 ) { 152 int nStop; 153 int nStart = str.indexOf( "\"" , nFound ); 154 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) 155 { 156 nStart = str.indexOf( "'" , nFound ); 157 nStop = str.indexOf( "'" , nStart +1 ); 158 } 159 else 160 { 161 nStop = str.indexOf( "\"" , nStart +1); 162 } 163 164 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) 165 { 166 // remove encoding tag from file 167 memmove( &( seq.getArray()[nFound] ) , 168 &( seq.getArray()[nStop+1]) , 169 seq.getLength() - nStop -1); 170 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) ); 171 // str = String( (char * ) seq.getArray() , seq.getLen() ); 172 } 173 } 174 } 175 } 176 177 // Checks, if enough data has been accumulated to recognize the encoding 178 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq) 179 { 180 const sal_Int8 *pSource = seq.getConstArray(); 181 sal_Bool bCheckIfFirstClosingBracketExsists = sal_False; 182 183 if( seq.getLength() < 8 ) { 184 // no recognition possible, when less than 8 bytes are available 185 return sal_False; 186 } 187 188 if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) { 189 // scan if the <?xml tag finishes within this buffer 190 bCheckIfFirstClosingBracketExsists = sal_True; 191 } 192 else if( ('<' == pSource[0] || '<' == pSource[2] ) && 193 ( ('?' == pSource[4] || '?' == pSource[6] ) ) ) 194 { 195 // check for utf-16 196 bCheckIfFirstClosingBracketExsists = sal_True; 197 } 198 else if( ( '<' == pSource[1] || '<' == pSource[3] ) && 199 ( '?' == pSource[5] || '?' == pSource[7] ) ) 200 { 201 // check for 202 bCheckIfFirstClosingBracketExsists = sal_True; 203 } 204 205 if( bCheckIfFirstClosingBracketExsists ) 206 { 207 for( sal_Int32 i = 0; i < seq.getLength() ; i ++ ) 208 { 209 // whole <?xml tag is valid 210 if( '>' == pSource[ i ] ) 211 { 212 return sal_True; 213 } 214 } 215 return sal_False; 216 } 217 218 // No <? tag in front, no need for a bigger buffer 219 return sal_True; 220 } 221 222 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq ) 223 { 224 const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() ); 225 sal_Bool bReturn = sal_True; 226 227 if( seq.getLength() < 4 ) { 228 // no recognition possible, when less than 4 bytes are available 229 return sal_False; 230 } 231 232 // first level : detect possible file formats 233 if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) { 234 235 // scan for encoding 236 OString str( (const sal_Char *) pSource , seq.getLength() ); 237 238 // cut sequence to first line break 239 //find first line break; 240 int nMax = str.indexOf( 10 ); 241 if( nMax >= 0 ) 242 { 243 str = str.copy( 0 , nMax ); 244 } 245 246 int nFound = str.indexOf( " encoding" ); 247 if( nFound < str.getLength() ) { 248 int nStop; 249 int nStart = str.indexOf( "\"" , nFound ); 250 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) 251 { 252 nStart = str.indexOf( "'" , nFound ); 253 nStop = str.indexOf( "'" , nStart +1 ); 254 } 255 else 256 { 257 nStop = str.indexOf( "\"" , nStart +1); 258 } 259 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) 260 { 261 // encoding found finally 262 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 ); 263 } 264 } 265 } 266 else if( 0xFE == pSource[0] && 267 0xFF == pSource[1] ) { 268 // UTF-16 big endian 269 // conversion is done so that encoding information can be easily extracted 270 m_sEncoding = "utf-16"; 271 } 272 else if( 0xFF == pSource[0] && 273 0xFE == pSource[1] ) { 274 // UTF-16 little endian 275 // conversion is done so that encoding information can be easily extracted 276 m_sEncoding = "utf-16"; 277 } 278 else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) { 279 // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.) 280 // The byte order mark is simply added 281 282 // simply add the byte order mark ! 283 seq.realloc( seq.getLength() + 2 ); 284 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); 285 ((sal_uInt8*)seq.getArray())[0] = 0xFE; 286 ((sal_uInt8*)seq.getArray())[1] = 0xFF; 287 288 m_sEncoding = "utf-16"; 289 } 290 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) { 291 // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.) 292 // The byte order mark is simply added 293 294 seq.realloc( seq.getLength() + 2 ); 295 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); 296 ((sal_uInt8*)seq.getArray())[0] = 0xFF; 297 ((sal_uInt8*)seq.getArray())[1] = 0xFE; 298 299 m_sEncoding = "utf-16"; 300 } 301 else if( 0xEF == pSource[0] && 302 0xBB == pSource[1] && 303 0xBF == pSource[2] ) 304 { 305 // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order 306 // The BOM is removed. 307 memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 ); 308 seq.realloc( seq.getLength() - 3 ); 309 m_sEncoding = "utf-8"; 310 } 311 else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) { 312 // UCS-4 big endian 313 m_sEncoding = "ucs-4"; 314 } 315 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) { 316 // UCS-4 little endian 317 m_sEncoding = "ucs-4"; 318 } 319 else if( 0x4c == pSource[0] && 0x6f == pSource[1] && 320 0xa7 == static_cast<unsigned char> (pSource[2]) && 321 0x94 == static_cast<unsigned char> (pSource[3]) ) { 322 // EBCDIC 323 bReturn = sal_False; // must be extended 324 } 325 else { 326 // other 327 // UTF8 is directly recognized by the parser. 328 bReturn = sal_False; 329 } 330 331 return bReturn; 332 } 333 334 void XMLFile2UTFConverter::initializeDecoding() 335 { 336 337 if( m_sEncoding.getLength() ) 338 { 339 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() ); 340 if( encoding != RTL_TEXTENCODING_UTF8 ) 341 { 342 m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding ); 343 m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 ); 344 } 345 } 346 } 347 348 349 //---------------------------------------------- 350 // 351 // Text2UnicodeConverter 352 // 353 //---------------------------------------------- 354 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding ) 355 { 356 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() ); 357 if( RTL_TEXTENCODING_DONTKNOW == encoding ) 358 { 359 m_bCanContinue = sal_False; 360 m_bInitialized = sal_False; 361 } 362 else 363 { 364 init( encoding ); 365 } 366 } 367 368 Text2UnicodeConverter::~Text2UnicodeConverter() 369 { 370 if( m_bInitialized ) 371 { 372 rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode ); 373 rtl_destroyUnicodeToTextConverter( m_convText2Unicode ); 374 } 375 } 376 377 void Text2UnicodeConverter::init( rtl_TextEncoding encoding ) 378 { 379 m_bCanContinue = sal_True; 380 m_bInitialized = sal_True; 381 382 m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding); 383 m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode ); 384 m_rtlEncoding = encoding; 385 } 386 387 388 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText ) 389 { 390 sal_uInt32 uiInfo; 391 sal_Size nSrcCvtBytes = 0; 392 sal_Size nTargetCount = 0; 393 sal_Size nSourceCount = 0; 394 395 // the whole source size 396 sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength(); 397 Sequence<sal_Unicode> seqUnicode ( nSourceSize ); 398 399 const sal_Int8 *pbSource = seqText.getConstArray(); 400 sal_Int8 *pbTempMem = 0; 401 402 if( m_seqSource.getLength() ) { 403 // put old rest and new byte sequence into one array 404 pbTempMem = new sal_Int8[ nSourceSize ]; 405 memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() ); 406 memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() ); 407 pbSource = pbTempMem; 408 409 // set to zero again 410 m_seqSource = Sequence< sal_Int8 >(); 411 } 412 413 while( sal_True ) { 414 415 /* All invalid characters are transformed to the unicode undefined char */ 416 nTargetCount += rtl_convertTextToUnicode( 417 m_convText2Unicode, 418 m_contextText2Unicode, 419 ( const sal_Char * ) &( pbSource[nSourceCount] ), 420 nSourceSize - nSourceCount , 421 &( seqUnicode.getArray()[ nTargetCount ] ), 422 seqUnicode.getLength() - nTargetCount, 423 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT | 424 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT | 425 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT, 426 &uiInfo, 427 &nSrcCvtBytes ); 428 nSourceCount += nSrcCvtBytes; 429 430 if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) { 431 // save necessary bytes for next conversion 432 seqUnicode.realloc( seqUnicode.getLength() * 2 ); 433 continue; 434 } 435 break; 436 } 437 if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) { 438 m_seqSource.realloc( nSourceSize - nSourceCount ); 439 memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount ); 440 } 441 442 443 if( pbTempMem ) { 444 delete [] pbTempMem; 445 } 446 447 // set to correct unicode size 448 seqUnicode.realloc( nTargetCount ); 449 450 return seqUnicode; 451 } 452 453 454 455 //---------------------------------------------- 456 // 457 // Unicode2TextConverter 458 // 459 //---------------------------------------------- 460 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding ) 461 { 462 init( encoding ); 463 } 464 465 466 Unicode2TextConverter::~Unicode2TextConverter() 467 { 468 if( m_bInitialized ) { 469 rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text ); 470 rtl_destroyUnicodeToTextConverter( m_convUnicode2Text ); 471 } 472 } 473 474 475 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize) 476 { 477 sal_Unicode *puTempMem = 0; 478 479 if( m_seqSource.getLength() ) { 480 // For surrogates ! 481 // put old rest and new byte sequence into one array 482 // In general when surrogates are used, they should be rarely 483 // cut off between two convert()-calls. So this code is used 484 // rarely and the extra copy is acceptable. 485 puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()]; 486 memcpy( puTempMem , 487 m_seqSource.getConstArray() , 488 m_seqSource.getLength() * sizeof( sal_Unicode ) ); 489 memcpy( 490 &(puTempMem[ m_seqSource.getLength() ]) , 491 puSource , 492 nSourceSize*sizeof( sal_Unicode ) ); 493 puSource = puTempMem; 494 nSourceSize += m_seqSource.getLength(); 495 496 m_seqSource = Sequence< sal_Unicode > (); 497 } 498 499 500 sal_Size nTargetCount = 0; 501 sal_Size nSourceCount = 0; 502 503 sal_uInt32 uiInfo; 504 sal_Size nSrcCvtChars; 505 506 // take nSourceSize * 3 as preference 507 // this is an upper boundary for converting to utf8, 508 // which most often used as the target. 509 sal_Int32 nSeqSize = nSourceSize * 3; 510 511 Sequence<sal_Int8> seqText( nSeqSize ); 512 sal_Char *pTarget = (sal_Char *) seqText.getArray(); 513 while( sal_True ) { 514 515 nTargetCount += rtl_convertUnicodeToText( 516 m_convUnicode2Text, 517 m_contextUnicode2Text, 518 &( puSource[nSourceCount] ), 519 nSourceSize - nSourceCount , 520 &( pTarget[nTargetCount] ), 521 nSeqSize - nTargetCount, 522 RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT | 523 RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT , 524 &uiInfo, 525 &nSrcCvtChars); 526 nSourceCount += nSrcCvtChars; 527 528 if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) { 529 nSeqSize = nSeqSize *2; 530 seqText.realloc( nSeqSize ); // double array size 531 pTarget = ( sal_Char * ) seqText.getArray(); 532 continue; 533 } 534 break; 535 } 536 537 // for surrogates 538 if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) { 539 m_seqSource.realloc( nSourceSize - nSourceCount ); 540 memcpy( m_seqSource.getArray() , 541 &(puSource[nSourceCount]), 542 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) ); 543 } 544 545 if( puTempMem ) { 546 delete [] puTempMem; 547 } 548 549 // reduce the size of the buffer (fast, no copy necessary) 550 seqText.realloc( nTargetCount ); 551 552 return seqText; 553 } 554 555 void Unicode2TextConverter::init( rtl_TextEncoding encoding ) 556 { 557 m_bCanContinue = sal_True; 558 m_bInitialized = sal_True; 559 560 m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding ); 561 m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text ); 562 m_rtlEncoding = encoding; 563 }; 564 565 566 } 567