1*cdf0e10cSrcweir /************************************************************************* 2*cdf0e10cSrcweir * 3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4*cdf0e10cSrcweir * 5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6*cdf0e10cSrcweir * 7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8*cdf0e10cSrcweir * 9*cdf0e10cSrcweir * This file is part of OpenOffice.org. 10*cdf0e10cSrcweir * 11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13*cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14*cdf0e10cSrcweir * 15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20*cdf0e10cSrcweir * 21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22*cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24*cdf0e10cSrcweir * for a copy of the LGPLv3 License. 25*cdf0e10cSrcweir * 26*cdf0e10cSrcweir ************************************************************************/ 27*cdf0e10cSrcweir #include <string.h> 28*cdf0e10cSrcweir 29*cdf0e10cSrcweir #include <sal/types.h> 30*cdf0e10cSrcweir 31*cdf0e10cSrcweir #include <rtl/textenc.h> 32*cdf0e10cSrcweir #include <rtl/tencinfo.h> 33*cdf0e10cSrcweir 34*cdf0e10cSrcweir 35*cdf0e10cSrcweir #include <com/sun/star/io/XInputStream.hpp> 36*cdf0e10cSrcweir 37*cdf0e10cSrcweir using namespace rtl; 38*cdf0e10cSrcweir using namespace ::com::sun::star::uno; 39*cdf0e10cSrcweir using namespace ::com::sun::star::io; 40*cdf0e10cSrcweir 41*cdf0e10cSrcweir #include "xml2utf.hxx" 42*cdf0e10cSrcweir 43*cdf0e10cSrcweir namespace sax_expatwrap { 44*cdf0e10cSrcweir 45*cdf0e10cSrcweir sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead ) 46*cdf0e10cSrcweir throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException ) 47*cdf0e10cSrcweir { 48*cdf0e10cSrcweir 49*cdf0e10cSrcweir Sequence<sal_Int8> seqIn; 50*cdf0e10cSrcweir 51*cdf0e10cSrcweir if( ! m_in.is() ) { 52*cdf0e10cSrcweir throw NotConnectedException(); 53*cdf0e10cSrcweir } 54*cdf0e10cSrcweir if( ! m_bStarted ) { 55*cdf0e10cSrcweir nMaxToRead = Max( 512 , nMaxToRead ); // it should be possible to find the encoding attribute 56*cdf0e10cSrcweir // within the first 512 bytes == 128 chars in UCS-4 57*cdf0e10cSrcweir } 58*cdf0e10cSrcweir 59*cdf0e10cSrcweir sal_Int32 nRead; 60*cdf0e10cSrcweir Sequence< sal_Int8 > seqStart; 61*cdf0e10cSrcweir while( sal_True ) 62*cdf0e10cSrcweir { 63*cdf0e10cSrcweir nRead = m_in->readSomeBytes( seq , nMaxToRead ); 64*cdf0e10cSrcweir 65*cdf0e10cSrcweir if( nRead + seqStart.getLength()) 66*cdf0e10cSrcweir { 67*cdf0e10cSrcweir // if nRead is 0, the file is already eof. 68*cdf0e10cSrcweir if( ! m_bStarted && nRead ) 69*cdf0e10cSrcweir { 70*cdf0e10cSrcweir // ensure that enough data is available to parse encoding 71*cdf0e10cSrcweir if( seqStart.getLength() ) 72*cdf0e10cSrcweir { 73*cdf0e10cSrcweir // prefix with what we had so far. 74*cdf0e10cSrcweir sal_Int32 nLength = seq.getLength(); 75*cdf0e10cSrcweir seq.realloc( seqStart.getLength() + nLength ); 76*cdf0e10cSrcweir 77*cdf0e10cSrcweir memmove (seq.getArray() + seqStart.getLength(), 78*cdf0e10cSrcweir seq.getConstArray(), 79*cdf0e10cSrcweir nLength); 80*cdf0e10cSrcweir memcpy (seq.getArray(), 81*cdf0e10cSrcweir seqStart.getConstArray(), 82*cdf0e10cSrcweir seqStart.getLength()); 83*cdf0e10cSrcweir } 84*cdf0e10cSrcweir 85*cdf0e10cSrcweir // autodetection with the first bytes 86*cdf0e10cSrcweir if( ! isEncodingRecognizable( seq ) ) 87*cdf0e10cSrcweir { 88*cdf0e10cSrcweir // remember what we have so far. 89*cdf0e10cSrcweir seqStart = seq; 90*cdf0e10cSrcweir 91*cdf0e10cSrcweir // read more ! 92*cdf0e10cSrcweir continue; 93*cdf0e10cSrcweir } 94*cdf0e10cSrcweir if( scanForEncoding( seq ) || m_sEncoding.getLength() ) { 95*cdf0e10cSrcweir // initialize decoding 96*cdf0e10cSrcweir initializeDecoding(); 97*cdf0e10cSrcweir } 98*cdf0e10cSrcweir nRead = seq.getLength(); 99*cdf0e10cSrcweir seqStart = Sequence < sal_Int8 > (); 100*cdf0e10cSrcweir } 101*cdf0e10cSrcweir 102*cdf0e10cSrcweir // do the encoding 103*cdf0e10cSrcweir if( m_pText2Unicode && m_pUnicode2Text && 104*cdf0e10cSrcweir m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) { 105*cdf0e10cSrcweir 106*cdf0e10cSrcweir Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq ); 107*cdf0e10cSrcweir seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() ); 108*cdf0e10cSrcweir } 109*cdf0e10cSrcweir 110*cdf0e10cSrcweir if( ! m_bStarted ) 111*cdf0e10cSrcweir { 112*cdf0e10cSrcweir // it must now be ensured, that no encoding attribute exist anymore 113*cdf0e10cSrcweir // ( otherwise the expat-Parser will crash ) 114*cdf0e10cSrcweir // This must be done after decoding ! 115*cdf0e10cSrcweir // ( e.g. Files decoded in ucs-4 cannot be read properly ) 116*cdf0e10cSrcweir m_bStarted = sal_True; 117*cdf0e10cSrcweir removeEncoding( seq ); 118*cdf0e10cSrcweir } 119*cdf0e10cSrcweir nRead = seq.getLength(); 120*cdf0e10cSrcweir } 121*cdf0e10cSrcweir 122*cdf0e10cSrcweir break; 123*cdf0e10cSrcweir } 124*cdf0e10cSrcweir return nRead; 125*cdf0e10cSrcweir } 126*cdf0e10cSrcweir 127*cdf0e10cSrcweir 128*cdf0e10cSrcweir XMLFile2UTFConverter::~XMLFile2UTFConverter() 129*cdf0e10cSrcweir { 130*cdf0e10cSrcweir if( m_pText2Unicode ) 131*cdf0e10cSrcweir delete m_pText2Unicode; 132*cdf0e10cSrcweir if( m_pUnicode2Text ) 133*cdf0e10cSrcweir delete m_pUnicode2Text; 134*cdf0e10cSrcweir } 135*cdf0e10cSrcweir 136*cdf0e10cSrcweir 137*cdf0e10cSrcweir void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq ) 138*cdf0e10cSrcweir { 139*cdf0e10cSrcweir const sal_Int8 *pSource = seq.getArray(); 140*cdf0e10cSrcweir if( ! strncmp( (const char * ) pSource , "<?xml" , 4) ) 141*cdf0e10cSrcweir { 142*cdf0e10cSrcweir 143*cdf0e10cSrcweir // scan for encoding 144*cdf0e10cSrcweir OString str( (sal_Char * ) pSource , seq.getLength() ); 145*cdf0e10cSrcweir 146*cdf0e10cSrcweir // cut sequence to first line break 147*cdf0e10cSrcweir // find first line break; 148*cdf0e10cSrcweir int nMax = str.indexOf( 10 ); 149*cdf0e10cSrcweir if( nMax >= 0 ) 150*cdf0e10cSrcweir { 151*cdf0e10cSrcweir str = str.copy( 0 , nMax ); 152*cdf0e10cSrcweir } 153*cdf0e10cSrcweir 154*cdf0e10cSrcweir int nFound = str.indexOf( " encoding" ); 155*cdf0e10cSrcweir if( nFound >= 0 ) { 156*cdf0e10cSrcweir int nStop; 157*cdf0e10cSrcweir int nStart = str.indexOf( "\"" , nFound ); 158*cdf0e10cSrcweir if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) 159*cdf0e10cSrcweir { 160*cdf0e10cSrcweir nStart = str.indexOf( "'" , nFound ); 161*cdf0e10cSrcweir nStop = str.indexOf( "'" , nStart +1 ); 162*cdf0e10cSrcweir } 163*cdf0e10cSrcweir else 164*cdf0e10cSrcweir { 165*cdf0e10cSrcweir nStop = str.indexOf( "\"" , nStart +1); 166*cdf0e10cSrcweir } 167*cdf0e10cSrcweir 168*cdf0e10cSrcweir if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) 169*cdf0e10cSrcweir { 170*cdf0e10cSrcweir // remove encoding tag from file 171*cdf0e10cSrcweir memmove( &( seq.getArray()[nFound] ) , 172*cdf0e10cSrcweir &( seq.getArray()[nStop+1]) , 173*cdf0e10cSrcweir seq.getLength() - nStop -1); 174*cdf0e10cSrcweir seq.realloc( seq.getLength() - ( nStop+1 - nFound ) ); 175*cdf0e10cSrcweir // str = String( (char * ) seq.getArray() , seq.getLen() ); 176*cdf0e10cSrcweir } 177*cdf0e10cSrcweir } 178*cdf0e10cSrcweir } 179*cdf0e10cSrcweir } 180*cdf0e10cSrcweir 181*cdf0e10cSrcweir // Checks, if enough data has been accumulated to recognize the encoding 182*cdf0e10cSrcweir sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq) 183*cdf0e10cSrcweir { 184*cdf0e10cSrcweir const sal_Int8 *pSource = seq.getConstArray(); 185*cdf0e10cSrcweir sal_Bool bCheckIfFirstClosingBracketExsists = sal_False; 186*cdf0e10cSrcweir 187*cdf0e10cSrcweir if( seq.getLength() < 8 ) { 188*cdf0e10cSrcweir // no recognition possible, when less than 8 bytes are available 189*cdf0e10cSrcweir return sal_False; 190*cdf0e10cSrcweir } 191*cdf0e10cSrcweir 192*cdf0e10cSrcweir if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) { 193*cdf0e10cSrcweir // scan if the <?xml tag finishes within this buffer 194*cdf0e10cSrcweir bCheckIfFirstClosingBracketExsists = sal_True; 195*cdf0e10cSrcweir } 196*cdf0e10cSrcweir else if( ('<' == pSource[0] || '<' == pSource[2] ) && 197*cdf0e10cSrcweir ( ('?' == pSource[4] || '?' == pSource[6] ) ) ) 198*cdf0e10cSrcweir { 199*cdf0e10cSrcweir // check for utf-16 200*cdf0e10cSrcweir bCheckIfFirstClosingBracketExsists = sal_True; 201*cdf0e10cSrcweir } 202*cdf0e10cSrcweir else if( ( '<' == pSource[1] || '<' == pSource[3] ) && 203*cdf0e10cSrcweir ( '?' == pSource[5] || '?' == pSource[7] ) ) 204*cdf0e10cSrcweir { 205*cdf0e10cSrcweir // check for 206*cdf0e10cSrcweir bCheckIfFirstClosingBracketExsists = sal_True; 207*cdf0e10cSrcweir } 208*cdf0e10cSrcweir 209*cdf0e10cSrcweir if( bCheckIfFirstClosingBracketExsists ) 210*cdf0e10cSrcweir { 211*cdf0e10cSrcweir for( sal_Int32 i = 0; i < seq.getLength() ; i ++ ) 212*cdf0e10cSrcweir { 213*cdf0e10cSrcweir // whole <?xml tag is valid 214*cdf0e10cSrcweir if( '>' == pSource[ i ] ) 215*cdf0e10cSrcweir { 216*cdf0e10cSrcweir return sal_True; 217*cdf0e10cSrcweir } 218*cdf0e10cSrcweir } 219*cdf0e10cSrcweir return sal_False; 220*cdf0e10cSrcweir } 221*cdf0e10cSrcweir 222*cdf0e10cSrcweir // No <? tag in front, no need for a bigger buffer 223*cdf0e10cSrcweir return sal_True; 224*cdf0e10cSrcweir } 225*cdf0e10cSrcweir 226*cdf0e10cSrcweir sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq ) 227*cdf0e10cSrcweir { 228*cdf0e10cSrcweir const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() ); 229*cdf0e10cSrcweir sal_Bool bReturn = sal_True; 230*cdf0e10cSrcweir 231*cdf0e10cSrcweir if( seq.getLength() < 4 ) { 232*cdf0e10cSrcweir // no recognition possible, when less than 4 bytes are available 233*cdf0e10cSrcweir return sal_False; 234*cdf0e10cSrcweir } 235*cdf0e10cSrcweir 236*cdf0e10cSrcweir // first level : detect possible file formats 237*cdf0e10cSrcweir if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) { 238*cdf0e10cSrcweir 239*cdf0e10cSrcweir // scan for encoding 240*cdf0e10cSrcweir OString str( (const sal_Char *) pSource , seq.getLength() ); 241*cdf0e10cSrcweir 242*cdf0e10cSrcweir // cut sequence to first line break 243*cdf0e10cSrcweir //find first line break; 244*cdf0e10cSrcweir int nMax = str.indexOf( 10 ); 245*cdf0e10cSrcweir if( nMax >= 0 ) 246*cdf0e10cSrcweir { 247*cdf0e10cSrcweir str = str.copy( 0 , nMax ); 248*cdf0e10cSrcweir } 249*cdf0e10cSrcweir 250*cdf0e10cSrcweir int nFound = str.indexOf( " encoding" ); 251*cdf0e10cSrcweir if( nFound < str.getLength() ) { 252*cdf0e10cSrcweir int nStop; 253*cdf0e10cSrcweir int nStart = str.indexOf( "\"" , nFound ); 254*cdf0e10cSrcweir if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) 255*cdf0e10cSrcweir { 256*cdf0e10cSrcweir nStart = str.indexOf( "'" , nFound ); 257*cdf0e10cSrcweir nStop = str.indexOf( "'" , nStart +1 ); 258*cdf0e10cSrcweir } 259*cdf0e10cSrcweir else 260*cdf0e10cSrcweir { 261*cdf0e10cSrcweir nStop = str.indexOf( "\"" , nStart +1); 262*cdf0e10cSrcweir } 263*cdf0e10cSrcweir if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) 264*cdf0e10cSrcweir { 265*cdf0e10cSrcweir // encoding found finally 266*cdf0e10cSrcweir m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 ); 267*cdf0e10cSrcweir } 268*cdf0e10cSrcweir } 269*cdf0e10cSrcweir } 270*cdf0e10cSrcweir else if( 0xFE == pSource[0] && 271*cdf0e10cSrcweir 0xFF == pSource[1] ) { 272*cdf0e10cSrcweir // UTF-16 big endian 273*cdf0e10cSrcweir // conversion is done so that encoding information can be easily extracted 274*cdf0e10cSrcweir m_sEncoding = "utf-16"; 275*cdf0e10cSrcweir } 276*cdf0e10cSrcweir else if( 0xFF == pSource[0] && 277*cdf0e10cSrcweir 0xFE == pSource[1] ) { 278*cdf0e10cSrcweir // UTF-16 little endian 279*cdf0e10cSrcweir // conversion is done so that encoding information can be easily extracted 280*cdf0e10cSrcweir m_sEncoding = "utf-16"; 281*cdf0e10cSrcweir } 282*cdf0e10cSrcweir else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) { 283*cdf0e10cSrcweir // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.) 284*cdf0e10cSrcweir // The byte order mark is simply added 285*cdf0e10cSrcweir 286*cdf0e10cSrcweir // simply add the byte order mark ! 287*cdf0e10cSrcweir seq.realloc( seq.getLength() + 2 ); 288*cdf0e10cSrcweir memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); 289*cdf0e10cSrcweir ((sal_uInt8*)seq.getArray())[0] = 0xFE; 290*cdf0e10cSrcweir ((sal_uInt8*)seq.getArray())[1] = 0xFF; 291*cdf0e10cSrcweir 292*cdf0e10cSrcweir m_sEncoding = "utf-16"; 293*cdf0e10cSrcweir } 294*cdf0e10cSrcweir else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) { 295*cdf0e10cSrcweir // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.) 296*cdf0e10cSrcweir // The byte order mark is simply added 297*cdf0e10cSrcweir 298*cdf0e10cSrcweir seq.realloc( seq.getLength() + 2 ); 299*cdf0e10cSrcweir memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); 300*cdf0e10cSrcweir ((sal_uInt8*)seq.getArray())[0] = 0xFF; 301*cdf0e10cSrcweir ((sal_uInt8*)seq.getArray())[1] = 0xFE; 302*cdf0e10cSrcweir 303*cdf0e10cSrcweir m_sEncoding = "utf-16"; 304*cdf0e10cSrcweir } 305*cdf0e10cSrcweir else if( 0xEF == pSource[0] && 306*cdf0e10cSrcweir 0xBB == pSource[1] && 307*cdf0e10cSrcweir 0xBF == pSource[2] ) 308*cdf0e10cSrcweir { 309*cdf0e10cSrcweir // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order 310*cdf0e10cSrcweir // The BOM is removed. 311*cdf0e10cSrcweir memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 ); 312*cdf0e10cSrcweir seq.realloc( seq.getLength() - 3 ); 313*cdf0e10cSrcweir m_sEncoding = "utf-8"; 314*cdf0e10cSrcweir } 315*cdf0e10cSrcweir else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) { 316*cdf0e10cSrcweir // UCS-4 big endian 317*cdf0e10cSrcweir m_sEncoding = "ucs-4"; 318*cdf0e10cSrcweir } 319*cdf0e10cSrcweir else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) { 320*cdf0e10cSrcweir // UCS-4 little endian 321*cdf0e10cSrcweir m_sEncoding = "ucs-4"; 322*cdf0e10cSrcweir } 323*cdf0e10cSrcweir else if( 0x4c == pSource[0] && 0x6f == pSource[1] && 324*cdf0e10cSrcweir 0xa7 == static_cast<unsigned char> (pSource[2]) && 325*cdf0e10cSrcweir 0x94 == static_cast<unsigned char> (pSource[3]) ) { 326*cdf0e10cSrcweir // EBCDIC 327*cdf0e10cSrcweir bReturn = sal_False; // must be extended 328*cdf0e10cSrcweir } 329*cdf0e10cSrcweir else { 330*cdf0e10cSrcweir // other 331*cdf0e10cSrcweir // UTF8 is directly recognized by the parser. 332*cdf0e10cSrcweir bReturn = sal_False; 333*cdf0e10cSrcweir } 334*cdf0e10cSrcweir 335*cdf0e10cSrcweir return bReturn; 336*cdf0e10cSrcweir } 337*cdf0e10cSrcweir 338*cdf0e10cSrcweir void XMLFile2UTFConverter::initializeDecoding() 339*cdf0e10cSrcweir { 340*cdf0e10cSrcweir 341*cdf0e10cSrcweir if( m_sEncoding.getLength() ) 342*cdf0e10cSrcweir { 343*cdf0e10cSrcweir rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() ); 344*cdf0e10cSrcweir if( encoding != RTL_TEXTENCODING_UTF8 ) 345*cdf0e10cSrcweir { 346*cdf0e10cSrcweir m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding ); 347*cdf0e10cSrcweir m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 ); 348*cdf0e10cSrcweir } 349*cdf0e10cSrcweir } 350*cdf0e10cSrcweir } 351*cdf0e10cSrcweir 352*cdf0e10cSrcweir 353*cdf0e10cSrcweir //---------------------------------------------- 354*cdf0e10cSrcweir // 355*cdf0e10cSrcweir // Text2UnicodeConverter 356*cdf0e10cSrcweir // 357*cdf0e10cSrcweir //---------------------------------------------- 358*cdf0e10cSrcweir Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding ) 359*cdf0e10cSrcweir { 360*cdf0e10cSrcweir rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() ); 361*cdf0e10cSrcweir if( RTL_TEXTENCODING_DONTKNOW == encoding ) 362*cdf0e10cSrcweir { 363*cdf0e10cSrcweir m_bCanContinue = sal_False; 364*cdf0e10cSrcweir m_bInitialized = sal_False; 365*cdf0e10cSrcweir } 366*cdf0e10cSrcweir else 367*cdf0e10cSrcweir { 368*cdf0e10cSrcweir init( encoding ); 369*cdf0e10cSrcweir } 370*cdf0e10cSrcweir } 371*cdf0e10cSrcweir 372*cdf0e10cSrcweir Text2UnicodeConverter::~Text2UnicodeConverter() 373*cdf0e10cSrcweir { 374*cdf0e10cSrcweir if( m_bInitialized ) 375*cdf0e10cSrcweir { 376*cdf0e10cSrcweir rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode ); 377*cdf0e10cSrcweir rtl_destroyUnicodeToTextConverter( m_convText2Unicode ); 378*cdf0e10cSrcweir } 379*cdf0e10cSrcweir } 380*cdf0e10cSrcweir 381*cdf0e10cSrcweir void Text2UnicodeConverter::init( rtl_TextEncoding encoding ) 382*cdf0e10cSrcweir { 383*cdf0e10cSrcweir m_bCanContinue = sal_True; 384*cdf0e10cSrcweir m_bInitialized = sal_True; 385*cdf0e10cSrcweir 386*cdf0e10cSrcweir m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding); 387*cdf0e10cSrcweir m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode ); 388*cdf0e10cSrcweir m_rtlEncoding = encoding; 389*cdf0e10cSrcweir } 390*cdf0e10cSrcweir 391*cdf0e10cSrcweir 392*cdf0e10cSrcweir Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText ) 393*cdf0e10cSrcweir { 394*cdf0e10cSrcweir sal_uInt32 uiInfo; 395*cdf0e10cSrcweir sal_Size nSrcCvtBytes = 0; 396*cdf0e10cSrcweir sal_Size nTargetCount = 0; 397*cdf0e10cSrcweir sal_Size nSourceCount = 0; 398*cdf0e10cSrcweir 399*cdf0e10cSrcweir // the whole source size 400*cdf0e10cSrcweir sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength(); 401*cdf0e10cSrcweir Sequence<sal_Unicode> seqUnicode ( nSourceSize ); 402*cdf0e10cSrcweir 403*cdf0e10cSrcweir const sal_Int8 *pbSource = seqText.getConstArray(); 404*cdf0e10cSrcweir sal_Int8 *pbTempMem = 0; 405*cdf0e10cSrcweir 406*cdf0e10cSrcweir if( m_seqSource.getLength() ) { 407*cdf0e10cSrcweir // put old rest and new byte sequence into one array 408*cdf0e10cSrcweir pbTempMem = new sal_Int8[ nSourceSize ]; 409*cdf0e10cSrcweir memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() ); 410*cdf0e10cSrcweir memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() ); 411*cdf0e10cSrcweir pbSource = pbTempMem; 412*cdf0e10cSrcweir 413*cdf0e10cSrcweir // set to zero again 414*cdf0e10cSrcweir m_seqSource = Sequence< sal_Int8 >(); 415*cdf0e10cSrcweir } 416*cdf0e10cSrcweir 417*cdf0e10cSrcweir while( sal_True ) { 418*cdf0e10cSrcweir 419*cdf0e10cSrcweir /* All invalid characters are transformed to the unicode undefined char */ 420*cdf0e10cSrcweir nTargetCount += rtl_convertTextToUnicode( 421*cdf0e10cSrcweir m_convText2Unicode, 422*cdf0e10cSrcweir m_contextText2Unicode, 423*cdf0e10cSrcweir ( const sal_Char * ) &( pbSource[nSourceCount] ), 424*cdf0e10cSrcweir nSourceSize - nSourceCount , 425*cdf0e10cSrcweir &( seqUnicode.getArray()[ nTargetCount ] ), 426*cdf0e10cSrcweir seqUnicode.getLength() - nTargetCount, 427*cdf0e10cSrcweir RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT | 428*cdf0e10cSrcweir RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT | 429*cdf0e10cSrcweir RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT, 430*cdf0e10cSrcweir &uiInfo, 431*cdf0e10cSrcweir &nSrcCvtBytes ); 432*cdf0e10cSrcweir nSourceCount += nSrcCvtBytes; 433*cdf0e10cSrcweir 434*cdf0e10cSrcweir if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) { 435*cdf0e10cSrcweir // save necessary bytes for next conversion 436*cdf0e10cSrcweir seqUnicode.realloc( seqUnicode.getLength() * 2 ); 437*cdf0e10cSrcweir continue; 438*cdf0e10cSrcweir } 439*cdf0e10cSrcweir break; 440*cdf0e10cSrcweir } 441*cdf0e10cSrcweir if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) { 442*cdf0e10cSrcweir m_seqSource.realloc( nSourceSize - nSourceCount ); 443*cdf0e10cSrcweir memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount ); 444*cdf0e10cSrcweir } 445*cdf0e10cSrcweir 446*cdf0e10cSrcweir 447*cdf0e10cSrcweir if( pbTempMem ) { 448*cdf0e10cSrcweir delete [] pbTempMem; 449*cdf0e10cSrcweir } 450*cdf0e10cSrcweir 451*cdf0e10cSrcweir // set to correct unicode size 452*cdf0e10cSrcweir seqUnicode.realloc( nTargetCount ); 453*cdf0e10cSrcweir 454*cdf0e10cSrcweir return seqUnicode; 455*cdf0e10cSrcweir } 456*cdf0e10cSrcweir 457*cdf0e10cSrcweir 458*cdf0e10cSrcweir 459*cdf0e10cSrcweir //---------------------------------------------- 460*cdf0e10cSrcweir // 461*cdf0e10cSrcweir // Unicode2TextConverter 462*cdf0e10cSrcweir // 463*cdf0e10cSrcweir //---------------------------------------------- 464*cdf0e10cSrcweir Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding ) 465*cdf0e10cSrcweir { 466*cdf0e10cSrcweir init( encoding ); 467*cdf0e10cSrcweir } 468*cdf0e10cSrcweir 469*cdf0e10cSrcweir 470*cdf0e10cSrcweir Unicode2TextConverter::~Unicode2TextConverter() 471*cdf0e10cSrcweir { 472*cdf0e10cSrcweir if( m_bInitialized ) { 473*cdf0e10cSrcweir rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text ); 474*cdf0e10cSrcweir rtl_destroyUnicodeToTextConverter( m_convUnicode2Text ); 475*cdf0e10cSrcweir } 476*cdf0e10cSrcweir } 477*cdf0e10cSrcweir 478*cdf0e10cSrcweir 479*cdf0e10cSrcweir Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize) 480*cdf0e10cSrcweir { 481*cdf0e10cSrcweir sal_Unicode *puTempMem = 0; 482*cdf0e10cSrcweir 483*cdf0e10cSrcweir if( m_seqSource.getLength() ) { 484*cdf0e10cSrcweir // For surrogates ! 485*cdf0e10cSrcweir // put old rest and new byte sequence into one array 486*cdf0e10cSrcweir // In general when surrogates are used, they should be rarely 487*cdf0e10cSrcweir // cut off between two convert()-calls. So this code is used 488*cdf0e10cSrcweir // rarely and the extra copy is acceptable. 489*cdf0e10cSrcweir puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()]; 490*cdf0e10cSrcweir memcpy( puTempMem , 491*cdf0e10cSrcweir m_seqSource.getConstArray() , 492*cdf0e10cSrcweir m_seqSource.getLength() * sizeof( sal_Unicode ) ); 493*cdf0e10cSrcweir memcpy( 494*cdf0e10cSrcweir &(puTempMem[ m_seqSource.getLength() ]) , 495*cdf0e10cSrcweir puSource , 496*cdf0e10cSrcweir nSourceSize*sizeof( sal_Unicode ) ); 497*cdf0e10cSrcweir puSource = puTempMem; 498*cdf0e10cSrcweir nSourceSize += m_seqSource.getLength(); 499*cdf0e10cSrcweir 500*cdf0e10cSrcweir m_seqSource = Sequence< sal_Unicode > (); 501*cdf0e10cSrcweir } 502*cdf0e10cSrcweir 503*cdf0e10cSrcweir 504*cdf0e10cSrcweir sal_Size nTargetCount = 0; 505*cdf0e10cSrcweir sal_Size nSourceCount = 0; 506*cdf0e10cSrcweir 507*cdf0e10cSrcweir sal_uInt32 uiInfo; 508*cdf0e10cSrcweir sal_Size nSrcCvtChars; 509*cdf0e10cSrcweir 510*cdf0e10cSrcweir // take nSourceSize * 3 as preference 511*cdf0e10cSrcweir // this is an upper boundary for converting to utf8, 512*cdf0e10cSrcweir // which most often used as the target. 513*cdf0e10cSrcweir sal_Int32 nSeqSize = nSourceSize * 3; 514*cdf0e10cSrcweir 515*cdf0e10cSrcweir Sequence<sal_Int8> seqText( nSeqSize ); 516*cdf0e10cSrcweir sal_Char *pTarget = (sal_Char *) seqText.getArray(); 517*cdf0e10cSrcweir while( sal_True ) { 518*cdf0e10cSrcweir 519*cdf0e10cSrcweir nTargetCount += rtl_convertUnicodeToText( 520*cdf0e10cSrcweir m_convUnicode2Text, 521*cdf0e10cSrcweir m_contextUnicode2Text, 522*cdf0e10cSrcweir &( puSource[nSourceCount] ), 523*cdf0e10cSrcweir nSourceSize - nSourceCount , 524*cdf0e10cSrcweir &( pTarget[nTargetCount] ), 525*cdf0e10cSrcweir nSeqSize - nTargetCount, 526*cdf0e10cSrcweir RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT | 527*cdf0e10cSrcweir RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT , 528*cdf0e10cSrcweir &uiInfo, 529*cdf0e10cSrcweir &nSrcCvtChars); 530*cdf0e10cSrcweir nSourceCount += nSrcCvtChars; 531*cdf0e10cSrcweir 532*cdf0e10cSrcweir if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) { 533*cdf0e10cSrcweir nSeqSize = nSeqSize *2; 534*cdf0e10cSrcweir seqText.realloc( nSeqSize ); // double array size 535*cdf0e10cSrcweir pTarget = ( sal_Char * ) seqText.getArray(); 536*cdf0e10cSrcweir continue; 537*cdf0e10cSrcweir } 538*cdf0e10cSrcweir break; 539*cdf0e10cSrcweir } 540*cdf0e10cSrcweir 541*cdf0e10cSrcweir // for surrogates 542*cdf0e10cSrcweir if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) { 543*cdf0e10cSrcweir m_seqSource.realloc( nSourceSize - nSourceCount ); 544*cdf0e10cSrcweir memcpy( m_seqSource.getArray() , 545*cdf0e10cSrcweir &(puSource[nSourceCount]), 546*cdf0e10cSrcweir (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) ); 547*cdf0e10cSrcweir } 548*cdf0e10cSrcweir 549*cdf0e10cSrcweir if( puTempMem ) { 550*cdf0e10cSrcweir delete [] puTempMem; 551*cdf0e10cSrcweir } 552*cdf0e10cSrcweir 553*cdf0e10cSrcweir // reduce the size of the buffer (fast, no copy necessary) 554*cdf0e10cSrcweir seqText.realloc( nTargetCount ); 555*cdf0e10cSrcweir 556*cdf0e10cSrcweir return seqText; 557*cdf0e10cSrcweir } 558*cdf0e10cSrcweir 559*cdf0e10cSrcweir void Unicode2TextConverter::init( rtl_TextEncoding encoding ) 560*cdf0e10cSrcweir { 561*cdf0e10cSrcweir m_bCanContinue = sal_True; 562*cdf0e10cSrcweir m_bInitialized = sal_True; 563*cdf0e10cSrcweir 564*cdf0e10cSrcweir m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding ); 565*cdf0e10cSrcweir m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text ); 566*cdf0e10cSrcweir m_rtlEncoding = encoding; 567*cdf0e10cSrcweir }; 568*cdf0e10cSrcweir 569*cdf0e10cSrcweir 570*cdf0e10cSrcweir } 571