xref: /AOO41X/main/svtools/source/svrtf/parrtf.cxx (revision 93ce685fe327053ff7f749df54963ad0c4a7c3d3)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_svtools.hxx"
26 
27 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
28 
29 #include <stdio.h>                      // for EOF
30 #include <rtl/tencinfo.h>
31 #include <tools/stream.hxx>
32 #include <tools/debug.hxx>
33 #include <svtools/rtftoken.h>
34 #include <svtools/rtfkeywd.hxx>
35 #include <svtools/parrtf.hxx>
36 
// Upper bound on the number of characters ScanText() collects before it
// hands the text back as one token.
const int MAX_STRING_LEN = 1024;
// Size of the scratch buffer used while collecting a control word name.
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification. The argument is parenthesized so
// that an expression argument (for example a conditional expression) is
// compared as a whole. Note that the argument is still evaluated twice
// (RTF_ISDIGIT) or four times (RTF_ISALPHA), so it must be free of side
// effects.
#define RTF_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define RTF_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
42 
SvRTFParser(SvStream & rIn,sal_uInt8 nStackSize)43 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
44     : SvParser( rIn, nStackSize ),
45     eUNICodeSet( RTL_TEXTENCODING_MS_1252 ),    // default ist ANSI-CodeSet
46     nUCharOverread( 1 )
47 {
48     // default ist ANSI-CodeSet
49     SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
50     bRTF_InTextRead = false;
51 }
52 
~SvRTFParser()53 SvRTFParser::~SvRTFParser()
54 {
55 }
56 
57 
58 
59 
_GetNextToken()60 int SvRTFParser::_GetNextToken()
61 {
62     int nRet = 0;
63     do {
64         int bNextCh = true;
65         switch( nNextCh )
66         {
67         case '\\':
68             {
69                 // Steuerzeichen
70                 switch( nNextCh = GetNextChar() )
71                 {
72                 case '{':
73                 case '}':
74                 case '\\':
75                 case '+':       // habe ich in einem RTF-File gefunden
76                 case '~':       // nonbreaking space
77                 case '-':       // optional hyphen
78                 case '_':       // nonbreaking hyphen
79                 case '\'':      // HexValue
80                     nNextCh = '\\';
81                     rInput.SeekRel( -1 );
82                     ScanText();
83                     nRet = RTF_TEXTTOKEN;
84                     bNextCh = 0 == nNextCh;
85                     break;
86 
87                 case '*':       // ignoreflag
88                     nRet = RTF_IGNOREFLAG;
89                     break;
90                 case ':':       // subentry in an index entry
91                     nRet = RTF_SUBENTRYINDEX;
92                     break;
93                 case '|':       // formula-charakter
94                     nRet = RTF_FORMULA;
95                     break;
96 
97                 case 0x0a:
98                 case 0x0d:
99                     nRet = RTF_PAR;
100                     break;
101 
102                 default:
103                     if( RTF_ISALPHA( nNextCh ) )
104                     {
105                         aToken = '\\';
106                         {
107                             String aStrBuffer;
108                             sal_Unicode* pStr = aStrBuffer.AllocBuffer(
109                                                             MAX_TOKEN_LEN );
110                             xub_StrLen nStrLen = 0;
111                             do {
112                                 *(pStr + nStrLen++) = nNextCh;
113                                 if( MAX_TOKEN_LEN == nStrLen )
114                                 {
115                                     aToken += aStrBuffer;
116                                     aToken.GetBufferAccess();  // make unique string!
117                                     nStrLen = 0;
118                                 }
119                                 nNextCh = GetNextChar();
120                             } while( RTF_ISALPHA( nNextCh ) );
121                             if( nStrLen )
122                             {
123                                 aStrBuffer.ReleaseBufferAccess( nStrLen );
124                                 aToken += aStrBuffer;
125                             }
126                         }
127 
128                         // Minus fuer numerischen Parameter
129                         int bNegValue = false;
130                         if( '-' == nNextCh )
131                         {
132                             bNegValue = true;
133                             nNextCh = GetNextChar();
134                         }
135 
136                         // evt. Numerischer Parameter
137                         if( RTF_ISDIGIT( nNextCh ) )
138                         {
139                             nTokenValue = 0;
140                             do {
141                                 nTokenValue *= 10;
142                                 nTokenValue += nNextCh - '0';
143                                 nNextCh = GetNextChar();
144                             } while( RTF_ISDIGIT( nNextCh ) );
145                             if( bNegValue )
146                                 nTokenValue = -nTokenValue;
147                             bTokenHasValue=true;
148                         }
149                         else if( bNegValue )        // das Minus wieder zurueck
150                         {
151                             nNextCh = '-';
152                             rInput.SeekRel( -1 );
153                         }
154                         if( ' ' == nNextCh )        // Blank gehoert zum Token!
155                             nNextCh = GetNextChar();
156 
157                         // suche das Token in der Tabelle:
158                         if( 0 == (nRet = GetRTFToken( aToken )) )
159                             // Unknown Control
160                             nRet = RTF_UNKNOWNCONTROL;
161 
162                         // bug 76812 - unicode token handled as normal text
163                         bNextCh = false;
164                         switch( nRet )
165                         {
166                         case RTF_UC:
167                             if( 0 <= nTokenValue )
168                             {
169                                 nUCharOverread = (sal_uInt8)nTokenValue;
170 #if 1
171                                 //cmc: other ifdef breaks #i3584
172                                 aParserStates.top().
173                                     nUCharOverread = nUCharOverread;
174 #else
175                                 if( !nUCharOverread )
176                                     nUCharOverread = aParserStates.top().nUCharOverread;
177                                 else
178                                     aParserStates.top().
179                                         nUCharOverread = nUCharOverread;
180 #endif
181                             }
182                             aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text
183                             // read next token
184                             nRet = 0;
185                             break;
186 
187                         case RTF_UPR:
188                             if (!_inSkipGroup) {
189                             // UPR - overread the group with the ansi
190                             //       informations
191                             while( '{' != _GetNextToken() )
192                                 ;
193                             SkipGroup();
194                             _GetNextToken();  // overread the last bracket
195                             nRet = 0;
196                             }
197                             break;
198 
199                         case RTF_U:
200                             if( !bRTF_InTextRead )
201                             {
202                                 nRet = RTF_TEXTTOKEN;
203                                 aToken = (sal_Unicode)nTokenValue;
204 
205                                 // overread the next n "RTF" characters. This
206                                 // can be also \{, \}, \'88
207                                 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
208                                 {
209                                     sal_Unicode cAnsi = nNextCh;
210                                     while( 0xD == cAnsi )
211                                         cAnsi = GetNextChar();
212                                     while( 0xA == cAnsi )
213                                         cAnsi = GetNextChar();
214 
215                                     if( '\\' == cAnsi &&
216                                         '\'' == ( cAnsi = GetNextChar() ))
217                                         // HexValue ueberlesen
218                                         cAnsi = GetHexValue();
219                                     nNextCh = GetNextChar();
220                                 }
221                                 ScanText();
222                                 bNextCh = 0 == nNextCh;
223                             }
224                             break;
225                         }
226                     }
227                     else if( SVPAR_PENDING != eState )
228                     {
229                         // Bug 34631 - "\ " ueberlesen - Blank als Zeichen
230                         // eState = SVPAR_ERROR;
231                         bNextCh = false;
232                     }
233                     break;
234                 }
235             }
236             break;
237 
238         case sal_Unicode(EOF):
239             eState = SVPAR_ACCEPTED;
240             nRet = nNextCh;
241             break;
242 
243         case '{':
244             {
245                 if( 0 <= nOpenBrakets )
246                 {
247                     RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
248                     aParserStates.push( aState );
249                 }
250                 ++nOpenBrakets;
251                 DBG_ASSERT(
252                     static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
253                     "ParserStateStack unequal to bracket count" );
254                 nRet = nNextCh;
255             }
256             break;
257 
258         case '}':
259             --nOpenBrakets;
260             if( 0 <= nOpenBrakets )
261             {
262                 aParserStates.pop();
263                 if( !aParserStates.empty() )
264                 {
265                     const RtfParserState_Impl& rRPS =
266                             aParserStates.top();
267                     nUCharOverread = rRPS.nUCharOverread;
268                     SetSrcEncoding( rRPS.eCodeSet );
269                 }
270                 else
271                 {
272                     nUCharOverread = 1;
273                     SetSrcEncoding( GetCodeSet() );
274                 }
275             }
276             DBG_ASSERT(
277                 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
278                 "ParserStateStack unequal to bracket count" );
279             nRet = nNextCh;
280             break;
281 
282         case 0x0d:
283         case 0x0a:
284             break;
285 
286         default:
287             // es folgt normaler Text
288             ScanText();
289             nRet = RTF_TEXTTOKEN;
290             bNextCh = 0 == nNextCh;
291             break;
292         }
293 
294         if( bNextCh )
295             nNextCh = GetNextChar();
296 
297     } while( !nRet && SVPAR_WORKING == eState );
298     return nRet;
299 }
300 
301 
GetHexValue()302 sal_Unicode SvRTFParser::GetHexValue()
303 {
304     // Hex-Wert sammeln
305     register int n;
306     register sal_Unicode nHexVal = 0;
307 
308     for( n = 0; n < 2; ++n )
309     {
310         nHexVal *= 16;
311         nNextCh = GetNextChar();
312         if( nNextCh >= '0' && nNextCh <= '9' )
313             nHexVal += (nNextCh - 48);
314         else if( nNextCh >= 'a' && nNextCh <= 'f' )
315             nHexVal += (nNextCh - 87);
316         else if( nNextCh >= 'A' && nNextCh <= 'F' )
317             nHexVal += (nNextCh - 55);
318     }
319     return nHexVal;
320 }
321 
ScanText(const sal_Unicode cBreak)322 void SvRTFParser::ScanText( const sal_Unicode cBreak )
323 {
324     String aStrBuffer;
325     int bWeiter = true;
326     while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
327     {
328         int bNextCh = true;
329         switch( nNextCh )
330         {
331         case '\\':
332             {
333                 switch (nNextCh = GetNextChar())
334                 {
335                 case '\'':
336                     {
337 
338 #if 0
339                         // #i35653 patch from cmc
340                         ByteString aByteString(static_cast<char>(GetHexValue()));
341                         if (aByteString.Len())
342                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
343 #else
344                         ByteString aByteString;
345                         while (1)
346                         {
347                             aByteString.Append((char)GetHexValue());
348 
349                             bool bBreak = false;
350                             sal_Char nSlash = '\\';
351                             while (!bBreak)
352                             {
353                                 wchar_t __next=GetNextChar();
354                                 if (__next>0xFF) // fix for #i43933# and #i35653#
355                                 {
356                                     if (aByteString.Len())
357                                         aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
358                                     aStrBuffer.Append((sal_Unicode)__next);
359 
360                                     aByteString.Erase();
361                                     continue;
362                                 }
363                                 nSlash = (sal_Char)__next;
364                                 while (nSlash == 0xD || nSlash == 0xA)
365                                     nSlash = (sal_Char)GetNextChar();
366 
367                                 switch (nSlash)
368                                 {
369                                     case '{':
370                                     case '}':
371                                     case '\\':
372                                         bBreak = true;
373                                         break;
374                                     default:
375                                         aByteString.Append(nSlash);
376                                         break;
377                                 }
378                             }
379 
380                             nNextCh = GetNextChar();
381 
382                             if (nSlash != '\\' || nNextCh != '\'')
383                             {
384                                 rInput.SeekRel(-1);
385                                 nNextCh = nSlash;
386                                 break;
387                             }
388                         }
389 
390                         bNextCh = false;
391 
392                         if (aByteString.Len())
393                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
394 #endif
395                     }
396                     break;
397                 case '\\':
398                 case '}':
399                 case '{':
400                 case '+':       // habe ich in einem RTF-File gefunden
401                     aStrBuffer.Append(nNextCh);
402                     break;
403                 case '~':       // nonbreaking space
404                     aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
405                     break;
406                 case '-':       // optional hyphen
407                     aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
408                     break;
409                 case '_':       // nonbreaking hyphen
410                     aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
411                     break;
412 
413                 case 'u':
414                     // UNI-Code Zeichen lesen
415                     {
416                         nNextCh = GetNextChar();
417                         rInput.SeekRel( -2 );
418 
419                         if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
420                         {
421                             bRTF_InTextRead = true;
422 
423                             String sSave( aToken );
424                             nNextCh = '\\';
425                             #ifdef DBG_UTIL
426                             int nToken =
427                             #endif
428                                 _GetNextToken();
429                             DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
430                             // dont convert symbol chars
431                             aStrBuffer.Append(
432                                 static_cast< sal_Unicode >(nTokenValue));
433 
434                             // overread the next n "RTF" characters. This
435                             // can be also \{, \}, \'88
436                             for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
437                             {
438                                 sal_Unicode cAnsi = nNextCh;
439                                 while( 0xD == cAnsi )
440                                     cAnsi = GetNextChar();
441                                 while( 0xA == cAnsi )
442                                     cAnsi = GetNextChar();
443 
444                                 if( '\\' == cAnsi &&
445                                     '\'' == ( cAnsi = GetNextChar() ))
446                                     // HexValue ueberlesen
447                                     cAnsi = GetHexValue();
448                                 nNextCh = GetNextChar();
449                             }
450                             bNextCh = false;
451                             aToken = sSave;
452                             bRTF_InTextRead = false;
453                         }
454                         else
455                         {
456                             nNextCh = '\\';
457                             bWeiter = false;        // Abbrechen, String zusammen
458                         }
459                     }
460                     break;
461 
462                 default:
463                     rInput.SeekRel( -1 );
464                     nNextCh = '\\';
465                     bWeiter = false;        // Abbrechen, String zusammen
466                     break;
467                 }
468             }
469             break;
470 
471         case sal_Unicode(EOF):
472                 eState = SVPAR_ERROR;
473                 // weiter
474         case '{':
475         case '}':
476             bWeiter = false;
477             break;
478 
479         case 0x0a:
480         case 0x0d:
481             break;
482 
483         default:
484             if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
485                 bWeiter = false;
486             else
487             {
488                 do {
489                     // alle anderen Zeichen kommen in den Text
490                     aStrBuffer.Append(nNextCh);
491 
492                     if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
493                     {
494                         if (aStrBuffer.Len())
495                             aToken += aStrBuffer;
496                         return;
497                     }
498                 } while
499                 (
500                     (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
501                     (aStrBuffer.Len() < MAX_STRING_LEN)
502                 );
503                 bNextCh = false;
504             }
505         }
506 
507         if( bWeiter && bNextCh )
508             nNextCh = GetNextChar();
509     }
510 
511     if (aStrBuffer.Len())
512         aToken += aStrBuffer;
513 }
514 
515 
516 short SvRTFParser::_inSkipGroup=0;
517 
SkipGroup()518 void SvRTFParser::SkipGroup()
519 {
520 short nBrackets=1;
521 if (_inSkipGroup>0)
522     return;
523 _inSkipGroup++;
524 #if 1   //#i16185# fecking \bin keyword
525     do
526     {
527         switch (nNextCh)
528         {
529             case '{':
530                 ++nBrackets;
531                 break;
532             case '}':
533                 if (!--nBrackets) {
534                     _inSkipGroup--;
535                     return;
536                 }
537                 break;
538         }
539         int nToken = _GetNextToken();
540         if (nToken == RTF_BIN)
541         {
542             rInput.SeekRel(-1);
543             rInput.SeekRel(nTokenValue);
544             nNextCh = GetNextChar();
545         }
546         while (nNextCh==0xa || nNextCh==0xd)
547         {
548             nNextCh = GetNextChar();
549         }
550     } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
551 #else
552     sal_Unicode cPrev = 0;
553     do {
554         switch( nNextCh )
555         {
556         case '{':
557             if( '\\' != cPrev )
558                 ++nBrackets;
559             break;
560 
561         case '}':
562             if( '\\' != cPrev && !--nBrackets )
563                 return;
564             break;
565 
566         case '\\':
567             if( '\\' == cPrev )
568                 nNextCh = 0;
569             break;
570         }
571         cPrev = nNextCh;
572         nNextCh = GetNextChar();
573     } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
574 #endif
575 
576     if( SVPAR_PENDING != eState && '}' != nNextCh )
577         eState = SVPAR_ERROR;
578     _inSkipGroup--;
579 }
580 
ReadUnknownData()581 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
ReadBitmapData()582 void SvRTFParser::ReadBitmapData()  { SkipGroup(); }
ReadOLEData()583 void SvRTFParser::ReadOLEData()     { SkipGroup(); }
584 
585 
CallParser()586 SvParserState SvRTFParser::CallParser()
587 {
588     sal_Char cFirstCh;
589     nNextChPos = rInput.Tell();
590     rInput >> cFirstCh; nNextCh = cFirstCh;
591     eState = SVPAR_WORKING;
592     nOpenBrakets = 0;
593     SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
594     eUNICodeSet = RTL_TEXTENCODING_MS_1252;     // default ist ANSI-CodeSet
595 
596     // die 1. beiden Token muessen '{' und \\rtf sein !!
597     if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
598     {
599         AddRef();
600         Continue( 0 );
601         if( SVPAR_PENDING != eState )
602             ReleaseRef();       // dann brauchen wir den Parser nicht mehr!
603     }
604     else
605         eState = SVPAR_ERROR;
606 
607     return eState;
608 }
609 
Continue(int nToken)610 void SvRTFParser::Continue( int nToken )
611 {
612 //  DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
613 //              "Zeichensatz wurde geaendert." );
614 
615     if( !nToken )
616         nToken = GetNextToken();
617 
618     while( IsParserWorking() )
619     {
620         SaveState( nToken );
621         switch( nToken )
622         {
623         case '}':
624             if( nOpenBrakets )
625                 goto NEXTTOKEN;
626             eState = SVPAR_ACCEPTED;
627             break;
628 
629         case '{':
630             // eine unbekannte Gruppe ?
631             {
632                 if( RTF_IGNOREFLAG != GetNextToken() )
633                     nToken = SkipToken( -1 );
634                 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
635                     nToken = SkipToken( -2 );
636                 else
637                 {
638                     // gleich herausfiltern
639                     ReadUnknownData();
640                     nToken = GetNextToken();
641                     if( '}' != nToken )
642                         eState = SVPAR_ERROR;
643                     break;      // auf zum naechsten Token!!
644                 }
645             }
646             goto NEXTTOKEN;
647 
648         case RTF_UNKNOWNCONTROL:
649             break;      // unbekannte Token ueberspringen
650         case RTF_NEXTTYPE:
651         case RTF_ANSITYPE:
652             SetEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
653             break;
654         case RTF_MACTYPE:
655             SetEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
656             break;
657         case RTF_PCTYPE:
658             SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
659             break;
660         case RTF_PCATYPE:
661             SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
662             break;
663         case RTF_ANSICPG:
664             eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
665             SetEncoding(eCodeSet);
666             break;
667         default:
668 NEXTTOKEN:
669             NextToken( nToken );
670             break;
671         }
672         if( IsParserWorking() )
673             SaveState( 0 );         // bis hierhin abgearbeitet,
674                                     // weiter mit neuem Token!
675         nToken = GetNextToken();
676     }
677     if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
678         eState = SVPAR_ERROR;
679 }
680 
SetEncoding(rtl_TextEncoding eEnc)681 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
682 {
683     if (eEnc == RTL_TEXTENCODING_DONTKNOW)
684         eEnc = GetCodeSet();
685 
686     if (!aParserStates.empty())
687         aParserStates.top().eCodeSet = eEnc;
688     SetSrcEncoding(eEnc);
689 }
690 
691 #ifdef USED
SaveState(int nToken)692 void SvRTFParser::SaveState( int nToken )
693 {
694     SvParser::SaveState( nToken );
695 }
696 
RestoreState()697 void SvRTFParser::RestoreState()
698 {
699     SvParser::RestoreState();
700 }
701 #endif
702 
703 /* vi:set tabstop=4 shiftwidth=4 expandtab: */
704