xref: /AOO41X/main/sax/source/expatwrap/xml2utf.cxx (revision f9b72d1151c0405011e988af4c8d57514307e7a3) !
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 #include <string.h>
24 
25 #include <sal/types.h>
26 
27 #include <rtl/textenc.h>
28 #include <rtl/tencinfo.h>
29 
30 
31 #include <com/sun/star/io/XInputStream.hpp>
32 
33 using namespace rtl;
34 using namespace ::com::sun::star::uno;
35 using namespace ::com::sun::star::io;
36 
37 #include "xml2utf.hxx"
38 
39 namespace sax_expatwrap {
40 
readAndConvert(Sequence<sal_Int8> & seq,sal_Int32 nMaxToRead)41 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
42     throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
43 {
44 
45     Sequence<sal_Int8> seqIn;
46 
47     if( ! m_in.is() ) {
48         throw NotConnectedException();
49     }
50     if( ! m_bStarted ) {
51         nMaxToRead = Max( 512 , nMaxToRead );   // it should be possible to find the encoding attribute
52                                                 // within the first 512 bytes == 128 chars in UCS-4
53     }
54 
55     sal_Int32 nRead;
56     Sequence< sal_Int8 > seqStart;
57     while( sal_True )
58     {
59         nRead = m_in->readSomeBytes( seq , nMaxToRead );
60 
61         if( nRead + seqStart.getLength())
62         {
63             // if nRead is 0, the file is already eof.
64             if( ! m_bStarted && nRead )
65             {
66                 // ensure that enough data is available to parse encoding
67                 if( seqStart.getLength() )
68                 {
69                   // prefix with what we had so far.
70                   sal_Int32 nLength = seq.getLength();
71                   seq.realloc( seqStart.getLength() + nLength );
72 
73                   memmove (seq.getArray() + seqStart.getLength(),
74                        seq.getConstArray(),
75                        nLength);
76                   memcpy  (seq.getArray(),
77                        seqStart.getConstArray(),
78                        seqStart.getLength());
79                 }
80 
81                 // autodetection with the first bytes
82                 if( ! isEncodingRecognizable( seq ) )
83                 {
84                   // remember what we have so far.
85                   seqStart = seq;
86 
87                   // read more !
88                   continue;
89                 }
90                 if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
91                     // initialize decoding
92                     initializeDecoding();
93                 }
94                 nRead = seq.getLength();
95                 seqStart = Sequence < sal_Int8 > ();
96             }
97 
98             // do the encoding
99             if( m_pText2Unicode && m_pUnicode2Text &&
100                 m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
101 
102                 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
103                 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
104             }
105 
106             if( ! m_bStarted )
107             {
108                 // it must now be ensured, that no encoding attribute exist anymore
109                 // ( otherwise the expat-Parser will crash )
110                 // This must be done after decoding !
111                 // ( e.g. Files decoded in ucs-4 cannot be read properly )
112                 m_bStarted = sal_True;
113                 removeEncoding( seq );
114             }
115             nRead = seq.getLength();
116         }
117 
118         break;
119     }
120     return nRead;
121 }
122 
123 
~XMLFile2UTFConverter()124 XMLFile2UTFConverter::~XMLFile2UTFConverter()
125 {
126     if( m_pText2Unicode )
127         delete m_pText2Unicode;
128     if( m_pUnicode2Text )
129         delete m_pUnicode2Text;
130 }
131 
132 
removeEncoding(Sequence<sal_Int8> & seq)133 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
134 {
135     const sal_Int8 *pSource = seq.getArray();
136     if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
137     {
138 
139         // scan for encoding
140         OString str( (sal_Char * ) pSource , seq.getLength() );
141 
142         // cut sequence to first line break
143         // find first line break;
144         int nMax = str.indexOf( 10 );
145         if( nMax >= 0 )
146         {
147             str = str.copy( 0 , nMax );
148         }
149 
150         int nFound = str.indexOf( " encoding" );
151         if( nFound >= 0 ) {
152             int nStop;
153             int nStart = str.indexOf( "\"" , nFound );
154             if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
155             {
156                 nStart = str.indexOf( "'" , nFound );
157                 nStop  = str.indexOf( "'" , nStart +1 );
158             }
159             else
160             {
161                 nStop  = str.indexOf( "\"" , nStart +1);
162             }
163 
164             if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
165             {
166                 // remove encoding tag from file
167                 memmove(        &( seq.getArray()[nFound] ) ,
168                                 &( seq.getArray()[nStop+1]) ,
169                                 seq.getLength() - nStop -1);
170                 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
171 //              str = String( (char * ) seq.getArray() , seq.getLen() );
172             }
173         }
174     }
175 }
176 
177 // Checks, if enough data has been accumulated to recognize the encoding
isEncodingRecognizable(const Sequence<sal_Int8> & seq)178 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
179 {
180     const sal_Int8 *pSource = seq.getConstArray();
181     sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
182 
183     if( seq.getLength() < 8 ) {
184         // no recognition possible, when less than 8 bytes are available
185         return sal_False;
186     }
187 
188     if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
189         // scan if the <?xml tag finishes within this buffer
190         bCheckIfFirstClosingBracketExsists = sal_True;
191     }
192     else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
193              ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
194     {
195         // check for utf-16
196         bCheckIfFirstClosingBracketExsists = sal_True;
197     }
198     else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
199              ( '?' == pSource[5] || '?' == pSource[7] ) )
200     {
201         // check for
202         bCheckIfFirstClosingBracketExsists = sal_True;
203     }
204 
205     if( bCheckIfFirstClosingBracketExsists )
206     {
207         for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
208         {
209             // whole <?xml tag is valid
210             if( '>' == pSource[ i ] )
211             {
212                 return sal_True;
213             }
214         }
215         return sal_False;
216     }
217 
218     // No <? tag in front, no need for a bigger buffer
219     return sal_True;
220 }
221 
scanForEncoding(Sequence<sal_Int8> & seq)222 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
223 {
224     const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
225     sal_Bool bReturn = sal_True;
226 
227     if( seq.getLength() < 4 ) {
228         // no recognition possible, when less than 4 bytes are available
229         return sal_False;
230     }
231 
232     // first level : detect possible file formats
233     if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
234 
235         // scan for encoding
236         OString str( (const sal_Char *) pSource , seq.getLength() );
237 
238         // cut sequence to first line break
239         //find first line break;
240         int nMax = str.indexOf( 10 );
241         if( nMax >= 0 )
242         {
243             str = str.copy( 0 , nMax );
244         }
245 
246         int nFound = str.indexOf( " encoding" );
247         if( nFound < str.getLength() ) {
248             int nStop;
249             int nStart = str.indexOf( "\"" , nFound );
250             if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
251             {
252                 nStart = str.indexOf( "'" , nFound );
253                 nStop  = str.indexOf( "'" , nStart +1 );
254             }
255             else
256             {
257                 nStop  = str.indexOf( "\"" , nStart +1);
258             }
259             if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
260             {
261                 // encoding found finally
262                 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
263             }
264         }
265     }
266     else if( 0xFE == pSource[0] &&
267              0xFF == pSource[1] ) {
268         // UTF-16 big endian
269         // conversion is done so that encoding information can be easily extracted
270         m_sEncoding = "utf-16";
271     }
272     else if( 0xFF == pSource[0] &&
273              0xFE == pSource[1] ) {
274         // UTF-16 little endian
275         // conversion is done so that encoding information can be easily extracted
276         m_sEncoding = "utf-16";
277     }
278     else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
279         // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
280         // The byte order mark is simply added
281 
282         // simply add the byte order mark !
283         seq.realloc( seq.getLength() + 2 );
284         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
285         ((sal_uInt8*)seq.getArray())[0] = 0xFE;
286         ((sal_uInt8*)seq.getArray())[1] = 0xFF;
287 
288         m_sEncoding = "utf-16";
289     }
290     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
291         // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
292         // The byte order mark is simply added
293 
294         seq.realloc( seq.getLength() + 2 );
295         memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
296         ((sal_uInt8*)seq.getArray())[0] = 0xFF;
297         ((sal_uInt8*)seq.getArray())[1] = 0xFE;
298 
299         m_sEncoding = "utf-16";
300     }
301     else if( 0xEF == pSource[0] &&
302              0xBB == pSource[1] &&
303              0xBF == pSource[2] )
304     {
305         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
306         // The BOM is removed.
307         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
308         seq.realloc( seq.getLength() - 3 );
309         m_sEncoding = "utf-8";
310     }
311     else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
312         // UCS-4 big endian
313         m_sEncoding = "ucs-4";
314     }
315     else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
316         // UCS-4 little endian
317         m_sEncoding = "ucs-4";
318     }
319     else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
320              0xa7 == static_cast<unsigned char> (pSource[2]) &&
321              0x94 == static_cast<unsigned char> (pSource[3]) ) {
322         // EBCDIC
323         bReturn = sal_False;   // must be extended
324     }
325     else {
326         // other
327         // UTF8 is directly recognized by the parser.
328         bReturn = sal_False;
329     }
330 
331     return bReturn;
332 }
333 
initializeDecoding()334 void XMLFile2UTFConverter::initializeDecoding()
335 {
336 
337     if( m_sEncoding.getLength() )
338     {
339         rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
340         if( encoding != RTL_TEXTENCODING_UTF8 )
341         {
342             m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
343             m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
344         }
345     }
346 }
347 
348 
349 //----------------------------------------------
350 //
351 // Text2UnicodeConverter
352 //
353 //----------------------------------------------
Text2UnicodeConverter(const OString & sEncoding)354 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
355 {
356     rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
357     if( RTL_TEXTENCODING_DONTKNOW == encoding )
358     {
359         m_bCanContinue = sal_False;
360         m_bInitialized = sal_False;
361     }
362     else
363     {
364         init( encoding );
365     }
366 }
367 
~Text2UnicodeConverter()368 Text2UnicodeConverter::~Text2UnicodeConverter()
369 {
370     if( m_bInitialized )
371     {
372         rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
373         rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
374     }
375 }
376 
init(rtl_TextEncoding encoding)377 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
378 {
379     m_bCanContinue = sal_True;
380     m_bInitialized = sal_True;
381 
382     m_convText2Unicode  = rtl_createTextToUnicodeConverter(encoding);
383     m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
384     m_rtlEncoding = encoding;
385 }
386 
387 
convert(const Sequence<sal_Int8> & seqText)388 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
389 {
390     sal_uInt32 uiInfo;
391     sal_Size nSrcCvtBytes   = 0;
392     sal_Size nTargetCount   = 0;
393     sal_Size nSourceCount   = 0;
394 
395     // the whole source size
396     sal_Int32   nSourceSize = seqText.getLength() + m_seqSource.getLength();
397     Sequence<sal_Unicode>   seqUnicode ( nSourceSize );
398 
399     const sal_Int8 *pbSource = seqText.getConstArray();
400     sal_Int8 *pbTempMem = 0;
401 
402     if( m_seqSource.getLength() ) {
403         // put old rest and new byte sequence into one array
404         pbTempMem = new sal_Int8[ nSourceSize ];
405         memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
406         memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
407         pbSource = pbTempMem;
408 
409         // set to zero again
410         m_seqSource = Sequence< sal_Int8 >();
411     }
412 
413     while( sal_True ) {
414 
415         /* All invalid characters are transformed to the unicode undefined char */
416         nTargetCount +=     rtl_convertTextToUnicode(
417                                     m_convText2Unicode,
418                                     m_contextText2Unicode,
419                                     ( const sal_Char * ) &( pbSource[nSourceCount] ),
420                                     nSourceSize - nSourceCount ,
421                                     &( seqUnicode.getArray()[ nTargetCount ] ),
422                                     seqUnicode.getLength() - nTargetCount,
423                                     RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
424                                     RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
425                                     RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
426                                     &uiInfo,
427                                     &nSrcCvtBytes );
428         nSourceCount += nSrcCvtBytes;
429 
430         if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
431             // save necessary bytes for next conversion
432             seqUnicode.realloc( seqUnicode.getLength() * 2 );
433             continue;
434         }
435         break;
436     }
437     if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
438         m_seqSource.realloc( nSourceSize - nSourceCount );
439         memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
440     }
441 
442 
443     if( pbTempMem ) {
444         delete [] pbTempMem;
445     }
446 
447     // set to correct unicode size
448     seqUnicode.realloc( nTargetCount );
449 
450     return seqUnicode;
451 }
452 
453 
454 
455 //----------------------------------------------
456 //
457 // Unicode2TextConverter
458 //
459 //----------------------------------------------
Unicode2TextConverter(rtl_TextEncoding encoding)460 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
461 {
462     init( encoding );
463 }
464 
465 
~Unicode2TextConverter()466 Unicode2TextConverter::~Unicode2TextConverter()
467 {
468     if( m_bInitialized ) {
469         rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
470         rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
471     }
472 }
473 
474 
convert(const sal_Unicode * puSource,sal_Int32 nSourceSize)475 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
476 {
477     sal_Unicode *puTempMem = 0;
478 
479     if( m_seqSource.getLength() ) {
480         // For surrogates !
481         // put old rest and new byte sequence into one array
482         // In general when surrogates are used, they should be rarely
483         // cut off between two convert()-calls. So this code is used
484         // rarely and the extra copy is acceptable.
485         puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
486         memcpy( puTempMem ,
487                 m_seqSource.getConstArray() ,
488                 m_seqSource.getLength() * sizeof( sal_Unicode ) );
489         memcpy(
490             &(puTempMem[ m_seqSource.getLength() ]) ,
491             puSource ,
492             nSourceSize*sizeof( sal_Unicode ) );
493         puSource = puTempMem;
494         nSourceSize += m_seqSource.getLength();
495 
496         m_seqSource = Sequence< sal_Unicode > ();
497     }
498 
499 
500     sal_Size nTargetCount = 0;
501     sal_Size nSourceCount = 0;
502 
503     sal_uInt32 uiInfo;
504     sal_Size nSrcCvtChars;
505 
506     // take nSourceSize * 3 as preference
507     // this is an upper boundary for converting to utf8,
508     // which most often used as the target.
509     sal_Int32 nSeqSize =  nSourceSize * 3;
510 
511     Sequence<sal_Int8>  seqText( nSeqSize );
512     sal_Char *pTarget = (sal_Char *) seqText.getArray();
513     while( sal_True ) {
514 
515         nTargetCount += rtl_convertUnicodeToText(
516                                     m_convUnicode2Text,
517                                     m_contextUnicode2Text,
518                                     &( puSource[nSourceCount] ),
519                                     nSourceSize - nSourceCount ,
520                                     &( pTarget[nTargetCount] ),
521                                     nSeqSize - nTargetCount,
522                                     RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
523                                     RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
524                                     &uiInfo,
525                                     &nSrcCvtChars);
526         nSourceCount += nSrcCvtChars;
527 
528         if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
529             nSeqSize = nSeqSize *2;
530             seqText.realloc( nSeqSize );  // double array size
531             pTarget = ( sal_Char * ) seqText.getArray();
532             continue;
533         }
534         break;
535     }
536 
537     // for surrogates
538     if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
539         m_seqSource.realloc( nSourceSize - nSourceCount );
540         memcpy( m_seqSource.getArray() ,
541                 &(puSource[nSourceCount]),
542                 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
543     }
544 
545     if( puTempMem ) {
546         delete [] puTempMem;
547     }
548 
549     // reduce the size of the buffer (fast, no copy necessary)
550     seqText.realloc( nTargetCount );
551 
552     return seqText;
553 }
554 
init(rtl_TextEncoding encoding)555 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
556 {
557     m_bCanContinue = sal_True;
558     m_bInitialized = sal_True;
559 
560     m_convUnicode2Text  = rtl_createUnicodeToTextConverter( encoding );
561     m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
562     m_rtlEncoding = encoding;
563 };
564 
565 
566 }
567