xref: /AOO41X/main/svtools/source/edit/syntaxhighlight.cxx (revision a9ab3c7b3d31474a75bf54404ada03e2f02464cb)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_svtools.hxx"
26 
27 #include <svtools/syntaxhighlight.hxx>
28 
29 #include <unotools/charclass.hxx>
30 #include <tools/debug.hxx>
31 
32 
33 // ##########################################################################
// ATTENTION: all these words must be lowercase and each list must stay sorted (they are looked up with bsearch)
35 // ##########################################################################
// BASIC keywords known to the highlighter.
// Every entry must be lowercase and the list must stay sorted in strcmp()
// order, because candidates are lowercased and then looked up with bsearch().
// NOTE(review): the multi-word entries ("end sub", "line input", ...) can
// presumably never match, since identifiers are scanned without spaces - confirm.
static const char* strListBasicKeyWords[] = {
    "access", "alias", "and", "any", "append", "as",
    "base", "binary", "boolean", "byref", "byte", "byval",
    "call", "case", "cdecl", "classmodule", "close", "compare", "compatible",
    "const", "currency",
    "date", "declare", "defbool", "defcur", "defdate", "defdbl", "deferr",
    "defint", "deflng", "defobj", "defsng", "defstr", "defvar", "dim", "do",
    "double",
    "each", "else", "elseif", "end", "end enum", "end function", "end if",
    "end select", "end sub", "end type", "endif", "enum", "eqv", "erase",
    "error", "exit", "explicit",
    "for", "function",
    "get", "global", "gosub", "goto",
    "if", "imp", "implements", "in", "input", "integer", "is",
    "let", "lib", "like", "line", "line input", "local", "lock", "long",
    "loop", "lprint", "lset",
    "mod",
    "name", "new", "next", "not",
    "object", "on", "open", "option", "optional", "or", "output",
    "preserve", "print", "private", "property", "public",
    "random", "read", "redim", "rem", "resume", "return", "rset",
    "select", "set", "shared", "single", "static", "step", "stop", "string",
    "sub", "system",
    "text", "then", "to", "type", "typeof",
    "until",
    "variant",
    "wend", "while", "with", "write",
    "xor"
};
162 
163 
// SQL keywords known to the highlighter.
// Every entry must be lowercase and the list must stay sorted in strcmp()
// order, because candidates are lowercased and then looked up with bsearch().
static const char* strListSqlKeyWords[] = {
    "all", "and", "any", "as", "asc", "avg",
    "between", "by",
    "cast", "corresponding", "count", "create", "cross",
    "delete", "desc", "distinct", "drop",
    "escape", "except", "exists",
    "false", "from", "full",
    "global", "group",
    "having",
    "in", "inner", "insert", "intersect", "into", "is",
    "join",
    "left", "like", "local",
    "match", "max", "min",
    "natural", "not", "null",
    "on", "or", "order", "outer",
    "right",
    "select", "set", "some", "sum",
    "table", "temporary", "true",
    "union", "unique", "unknown", "update", "using",
    "values",
    "where"
};
227 
228 
compare_strings(const void * arg1,const void * arg2)229 extern "C" int CDECL compare_strings( const void *arg1, const void *arg2 )
230 {
231     return strcmp( (char *)arg1, *(char **)arg2 );
232 }
233 
234 
235 class LetterTable
236 {
237     bool        IsLetterTab[256];
238 
239 public:
240     LetterTable( void );
241 
isLetter(sal_Unicode c)242     inline bool isLetter( sal_Unicode c )
243     {
244         bool bRet = (c < 256) ? IsLetterTab[c] : isLetterUnicode( c );
245         return bRet;
246     }
247     bool isLetterUnicode( sal_Unicode c );
248 };
249 
250 class BasicSimpleCharClass
251 {
252     static LetterTable aLetterTable;
253 
254 public:
isAlpha(sal_Unicode c,bool bCompatible)255     static sal_Bool isAlpha( sal_Unicode c, bool bCompatible )
256     {
257         sal_Bool bRet = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
258                     || (bCompatible && aLetterTable.isLetter( c ));
259         return bRet;
260     }
261 
isDigit(sal_Unicode c)262     static sal_Bool isDigit( sal_Unicode c )
263     {
264         sal_Bool bRet = (c >= '0' && c <= '9');
265         return bRet;
266     }
267 
isAlphaNumeric(sal_Unicode c,bool bCompatible)268     static sal_Bool isAlphaNumeric( sal_Unicode c, bool bCompatible )
269     {
270         sal_Bool bRet = isDigit( c ) || isAlpha( c, bCompatible );
271         return bRet;
272     }
273 };
274 
275 LetterTable BasicSimpleCharClass::aLetterTable;
276 
LetterTable(void)277 LetterTable::LetterTable( void )
278 {
279     for( int i = 0 ; i < 256 ; ++i )
280         IsLetterTab[i] = false;
281 
282     IsLetterTab[0xC0] = true;   // ?, CAPITAL LETTER A WITH GRAVE ACCENT
283     IsLetterTab[0xC1] = true;   // ?, CAPITAL LETTER A WITH ACUTE ACCENT
284     IsLetterTab[0xC2] = true;   // ?, CAPITAL LETTER A WITH CIRCUMFLEX ACCENT
285     IsLetterTab[0xC3] = true;   // ?, CAPITAL LETTER A WITH TILDE
286     IsLetterTab[0xC4] = true;   // ?, CAPITAL LETTER A WITH DIAERESIS
287     IsLetterTab[0xC5] = true;   // ?, CAPITAL LETTER A WITH RING ABOVE
288     IsLetterTab[0xC6] = true;   // ?, CAPITAL LIGATURE AE
289     IsLetterTab[0xC7] = true;   // ?, CAPITAL LETTER C WITH CEDILLA
290     IsLetterTab[0xC8] = true;   // ?, CAPITAL LETTER E WITH GRAVE ACCENT
291     IsLetterTab[0xC9] = true;   // ?, CAPITAL LETTER E WITH ACUTE ACCENT
292     IsLetterTab[0xCA] = true;   // ?, CAPITAL LETTER E WITH CIRCUMFLEX ACCENT
293     IsLetterTab[0xCB] = true;   // ?, CAPITAL LETTER E WITH DIAERESIS
294     IsLetterTab[0xCC] = true;   // ?, CAPITAL LETTER I WITH GRAVE ACCENT
295     IsLetterTab[0xCD] = true;   // ?, CAPITAL LETTER I WITH ACUTE ACCENT
296     IsLetterTab[0xCE] = true;   // ?, CAPITAL LETTER I WITH CIRCUMFLEX ACCENT
297     IsLetterTab[0xCF] = true;   // ?, CAPITAL LETTER I WITH DIAERESIS
298     IsLetterTab[0xD0] = true;   // ?, CAPITAL LETTER ETH
299     IsLetterTab[0xD1] = true;   // ?, CAPITAL LETTER N WITH TILDE
300     IsLetterTab[0xD2] = true;   // ?, CAPITAL LETTER O WITH GRAVE ACCENT
301     IsLetterTab[0xD3] = true;   // ?, CAPITAL LETTER O WITH ACUTE ACCENT
302     IsLetterTab[0xD4] = true;   // ?, CAPITAL LETTER O WITH CIRCUMFLEX ACCENT
303     IsLetterTab[0xD5] = true;   // ?, CAPITAL LETTER O WITH TILDE
304     IsLetterTab[0xD6] = true;   // ?, CAPITAL LETTER O WITH DIAERESIS
305     IsLetterTab[0xD8] = true;   // ?, CAPITAL LETTER O WITH STROKE
306     IsLetterTab[0xD9] = true;   // ?, CAPITAL LETTER U WITH GRAVE ACCENT
307     IsLetterTab[0xDA] = true;   // ?, CAPITAL LETTER U WITH ACUTE ACCENT
308     IsLetterTab[0xDB] = true;   // ?, CAPITAL LETTER U WITH CIRCUMFLEX ACCENT
309     IsLetterTab[0xDC] = true;   // ?, CAPITAL LETTER U WITH DIAERESIS
310     IsLetterTab[0xDD] = true;   // ?, CAPITAL LETTER Y WITH ACUTE ACCENT
311     IsLetterTab[0xDE] = true;   // ?, CAPITAL LETTER THORN
312     IsLetterTab[0xDF] = true;   // ?, SMALL LETTER SHARP S
313     IsLetterTab[0xE0] = true;   // ?, SMALL LETTER A WITH GRAVE ACCENT
314     IsLetterTab[0xE1] = true;   // ?, SMALL LETTER A WITH ACUTE ACCENT
315     IsLetterTab[0xE2] = true;   // ?, SMALL LETTER A WITH CIRCUMFLEX ACCENT
316     IsLetterTab[0xE3] = true;   // ?, SMALL LETTER A WITH TILDE
317     IsLetterTab[0xE4] = true;   // ?, SMALL LETTER A WITH DIAERESIS
318     IsLetterTab[0xE5] = true;   // ?, SMALL LETTER A WITH RING ABOVE
319     IsLetterTab[0xE6] = true;   // ?, SMALL LIGATURE AE
320     IsLetterTab[0xE7] = true;   // ?, SMALL LETTER C WITH CEDILLA
321     IsLetterTab[0xE8] = true;   // ?, SMALL LETTER E WITH GRAVE ACCENT
322     IsLetterTab[0xE9] = true;   // ?, SMALL LETTER E WITH ACUTE ACCENT
323     IsLetterTab[0xEA] = true;   // ?, SMALL LETTER E WITH CIRCUMFLEX ACCENT
324     IsLetterTab[0xEB] = true;   // ?, SMALL LETTER E WITH DIAERESIS
325     IsLetterTab[0xEC] = true;   // ?, SMALL LETTER I WITH GRAVE ACCENT
326     IsLetterTab[0xED] = true;   // ?, SMALL LETTER I WITH ACUTE ACCENT
327     IsLetterTab[0xEE] = true;   // ?, SMALL LETTER I WITH CIRCUMFLEX ACCENT
328     IsLetterTab[0xEF] = true;   // ?, SMALL LETTER I WITH DIAERESIS
329     IsLetterTab[0xF0] = true;   // ?, SMALL LETTER ETH
330     IsLetterTab[0xF1] = true;   // ?, SMALL LETTER N WITH TILDE
331     IsLetterTab[0xF2] = true;   // ?, SMALL LETTER O WITH GRAVE ACCENT
332     IsLetterTab[0xF3] = true;   // ?, SMALL LETTER O WITH ACUTE ACCENT
333     IsLetterTab[0xF4] = true;   // ?, SMALL LETTER O WITH CIRCUMFLEX ACCENT
334     IsLetterTab[0xF5] = true;   // ?, SMALL LETTER O WITH TILDE
335     IsLetterTab[0xF6] = true;   // ?, SMALL LETTER O WITH DIAERESIS
336     IsLetterTab[0xF8] = true;   // ?, SMALL LETTER O WITH OBLIQUE BAR
337     IsLetterTab[0xF9] = true;   // ?, SMALL LETTER U WITH GRAVE ACCENT
338     IsLetterTab[0xFA] = true;   // ?, SMALL LETTER U WITH ACUTE ACCENT
339     IsLetterTab[0xFB] = true;   // ?, SMALL LETTER U WITH CIRCUMFLEX ACCENT
340     IsLetterTab[0xFC] = true;   // ?, SMALL LETTER U WITH DIAERESIS
341     IsLetterTab[0xFD] = true;   // ?, SMALL LETTER Y WITH ACUTE ACCENT
342     IsLetterTab[0xFE] = true;   // ?, SMALL LETTER THORN
343     IsLetterTab[0xFF] = true;   // � , SMALL LETTER Y WITH DIAERESIS
344 }
345 
isLetterUnicode(sal_Unicode c)346 bool LetterTable::isLetterUnicode( sal_Unicode c )
347 {
348     static CharClass* pCharClass = NULL;
349     if( pCharClass == NULL )
350         pCharClass = new CharClass( Application::GetSettings().GetLocale() );
351     String aStr( c );
352     bool bRet = pCharClass->isLetter( aStr, 0 );
353     return bRet;
354 }
355 
356 // Hilfsfunktion: Zeichen-Flag Testen
testCharFlags(sal_Unicode c,sal_uInt16 nTestFlags)357 sal_Bool SimpleTokenizer_Impl::testCharFlags( sal_Unicode c, sal_uInt16 nTestFlags )
358 {
359     bool bRet = false;
360     if( c != 0 && c <= 255 )
361     {
362         bRet = ( (aCharTypeTab[c] & nTestFlags) != 0 );
363     }
364     else if( c > 255 )
365     {
366         bRet = (( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER ) & nTestFlags) != 0
367             ? BasicSimpleCharClass::isAlpha( c, true ) : false;
368     }
369     return bRet;
370 }
371 
setKeyWords(const char ** ppKeyWords,sal_uInt16 nCount)372 void SimpleTokenizer_Impl::setKeyWords( const char** ppKeyWords, sal_uInt16 nCount )
373 {
374     ppListKeyWords = ppKeyWords;
375     nKeyWordCount = nCount;
376 }
377 
378 // Neues Token holen
getNextToken(TokenTypes & reType,const sal_Unicode * & rpStartPos,const sal_Unicode * & rpEndPos)379 sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
380     /*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos )
381 {
382     reType = TT_UNKNOWN;
383 
384     // Position merken
385     rpStartPos = mpActualPos;
386 
387     // Zeichen untersuchen
388     sal_Unicode c = peekChar();
389     if( c == CHAR_EOF )
390         return sal_False;
391 
392     // Zeichen lesen
393     getChar();
394 
395     //*** Alle Moeglichkeiten durchgehen ***
396     // Space?
397     if ( (testCharFlags( c, CHAR_SPACE ) == sal_True) )
398     {
399         while( testCharFlags( peekChar(), CHAR_SPACE ) == sal_True )
400             getChar();
401 
402         reType = TT_WHITESPACE;
403     }
404 
405     // Identifier?
406     else if ( (testCharFlags( c, CHAR_START_IDENTIFIER ) == sal_True) )
407     {
408         sal_Bool bIdentifierChar;
409         do
410         {
411             // Naechstes Zeichen holen
412             c = peekChar();
413             bIdentifierChar = testCharFlags( c, CHAR_IN_IDENTIFIER );
414             if( bIdentifierChar )
415                 getChar();
416         }
417         while( bIdentifierChar );
418 
419         reType = TT_IDENTIFIER;
420 
421         // Schluesselwort-Tabelle
422         if (ppListKeyWords != NULL)
423         {
424             int nCount = mpActualPos - rpStartPos;
425 
426             // No keyword if string contains char > 255
427             bool bCanBeKeyword = true;
428             for( int i = 0 ; i < nCount ; i++ )
429             {
430                 if( rpStartPos[i] > 255 )
431                 {
432                     bCanBeKeyword = false;
433                     break;
434                 }
435             }
436 
437             if( bCanBeKeyword )
438             {
439                 String aKWString(rpStartPos, sal::static_int_cast< xub_StrLen >(nCount) );
440                 ByteString aByteStr( aKWString, RTL_TEXTENCODING_ASCII_US );
441                 aByteStr.ToLowerAscii();
442                 if ( bsearch( aByteStr.GetBuffer(), ppListKeyWords, nKeyWordCount, sizeof( char* ),
443                                                                         compare_strings ) )
444                 {
445                     reType = TT_KEYWORDS;
446 
447                     if ( aByteStr.Equals( "rem" ) )
448                     {
449                         // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
450                         sal_Unicode cPeek = peekChar();
451                         while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
452                         {
453                             c = getChar();
454                             cPeek = peekChar();
455                         }
456 
457                         reType = TT_COMMENT;
458                     }
459                 }
460             }
461         }
462     }
463 
464     // Operator?
465     // only for BASIC '\'' should be a comment, otherwise it is a normal string and handled there
466     else if ( ( testCharFlags( c, CHAR_OPERATOR ) == sal_True ) || ( (c == '\'') && (aLanguage==HIGHLIGHT_BASIC)) )
467     {
468         // paramters for SQL view
469         if ( (c==':') || (c=='?'))
470         {
471             if (c!='?')
472             {
473                 sal_Bool bIdentifierChar;
474                 do
475                 {
476                     // Naechstes Zeichen holen
477                     c = peekChar();
478                     bIdentifierChar =  BasicSimpleCharClass::isAlpha( c, true );
479                     if( bIdentifierChar )
480                         getChar();
481                 }
482                 while( bIdentifierChar );
483             }
484             reType = TT_PARAMETER;
485         }
486         else if( c=='-' )
487         {
488             sal_Unicode cPeekNext = peekChar();
489             if (cPeekNext=='-')
490             {
491                 // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
492                 while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
493                 {
494                     getChar();
495                     cPeekNext = peekChar();
496                 }
497                 reType = TT_COMMENT;
498             }
499         }
500        else if (c=='/')
501        {
502            sal_Unicode cPeekNext = peekChar();
503            if (cPeekNext=='/')
504            {
505                // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
506                while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
507                {
508                    getChar();
509                    cPeekNext = peekChar();
510                }
511                reType = TT_COMMENT;
512            }
513        }
514         else
515         {
516             // Kommentar ?
517             if ( c == '\'' )
518             {
519                 c = getChar();  // '/' entfernen
520 
521                 // Alle Zeichen bis Zeilen-Ende oder EOF entfernen
522                 sal_Unicode cPeek = c;
523                 while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
524                 {
525                     getChar();
526                     cPeek = peekChar();
527                 }
528 
529                 reType = TT_COMMENT;
530             }
531 
532             // Echter Operator, kann hier einfach behandelt werden,
533             // da nicht der wirkliche Operator, wie z.B. += interessiert,
534             // sondern nur die Tatsache, dass es sich um einen handelt.
535             if( reType != TT_COMMENT )
536             {
537                 reType = TT_OPERATOR;
538             }
539 
540         }
541     }
542 
543     // Objekt-Trenner? Muss vor Number abgehandelt werden
544     else if( c == '.' && ( peekChar() < '0' || peekChar() > '9' ) )
545     {
546         reType = TT_OPERATOR;
547     }
548 
549     // Zahl?
550     else if( testCharFlags( c, CHAR_START_NUMBER ) == sal_True )
551     {
552         reType = TT_NUMBER;
553 
554         // Zahlensystem, 10 = normal, wird bei Oct/Hex geaendert
555         int nRadix = 10;
556 
557         // Ist es eine Hex- oder Oct-Zahl?
558         if( c == '&' )
559         {
560             // Octal?
561             if( peekChar() == 'o' || peekChar() == 'O' )
562             {
563                 // o entfernen
564                 getChar();
565                 nRadix = 8;     // Octal-Basis
566 
567                 // Alle Ziffern einlesen
568                 while( testCharFlags( peekChar(), CHAR_IN_OCT_NUMBER ) )
569                     c = getChar();
570             }
571             // Hex?
572             else if( peekChar() == 'h' || peekChar() == 'H' )
573             {
574                 // x entfernen
575                 getChar();
576                 nRadix = 16;     // Hex-Basis
577 
578                 // Alle Ziffern einlesen und puffern
579                 while( testCharFlags( peekChar(), CHAR_IN_HEX_NUMBER ) )
580                     c = getChar();
581             }
582             else
583             {
584                 reType = TT_OPERATOR;
585             }
586         }
587 
588         // Wenn nicht Oct oder Hex als double ansehen
589         if( reType == TT_NUMBER && nRadix == 10 )
590         {
591             // Flag, ob das letzte Zeichen ein Exponent war
592             sal_Bool bAfterExpChar = sal_False;
593 
594             // Alle Ziffern einlesen
595             while( testCharFlags( peekChar(), CHAR_IN_NUMBER ) ||
596                     (bAfterExpChar && peekChar() == '+' ) ||
597                     (bAfterExpChar && peekChar() == '-' ) )
598                     // Nach Exponent auch +/- OK
599             {
600                 c = getChar();                  // Zeichen lesen
601                 bAfterExpChar = ( c == 'e' || c == 'E' );
602             }
603         }
604 
605         // reType = TT_NUMBER;
606     }
607 
608     // String?
609     else if( testCharFlags( c, CHAR_START_STRING ) == sal_True )
610     {
611         // Merken, welches Zeichen den String eroeffnet hat
612         sal_Unicode cEndString = c;
613         if( c == '[' )
614             cEndString = ']';
615 
616         // Alle Ziffern einlesen und puffern
617         while( peekChar() != cEndString )
618         {
619             // #58846 EOF vor getChar() abfangen, damit EOF micht verloren geht
620             if( peekChar() == CHAR_EOF )
621             {
622                 // ERROR: unterminated string literal
623                 reType = TT_ERROR;
624                 break;
625             }
626             c = getChar();
627             if( testCharFlags( c, CHAR_EOL ) == sal_True )
628             {
629                 // ERROR: unterminated string literal
630                 reType = TT_ERROR;
631                 break;
632             }
633         }
634 
635         //  Zeichen lesen
636         if( reType != TT_ERROR )
637         {
638             getChar();
639             if( cEndString == ']' )
640                 reType = TT_IDENTIFIER;
641             else
642                 reType = TT_STRING;
643         }
644     }
645 
646     // Zeilenende?
647     else if( testCharFlags( c, CHAR_EOL ) == sal_True )
648     {
649         // Falls ein weiteres anderes EOL-Char folgt, weg damit
650         sal_Unicode cNext = peekChar();
651         if( cNext != c && testCharFlags( cNext, CHAR_EOL ) == sal_True )
652             getChar();
653 
654         // Positions-Daten auf Zeilen-Beginn setzen
655         nCol = 0;
656         nLine++;
657 
658         reType = TT_EOL;
659     }
660 
661     // Alles andere bleibt TT_UNKNOWN
662 
663 
664     // End-Position eintragen
665     rpEndPos = mpActualPos;
666     return sal_True;
667 }
668 
getTokStr(const sal_Unicode * pStartPos,const sal_Unicode * pEndPos)669 String SimpleTokenizer_Impl::getTokStr
670     ( /*out*/const sal_Unicode* pStartPos, /*out*/const sal_Unicode* pEndPos )
671 {
672     return String( pStartPos, (sal_uInt16)( pEndPos - pStartPos ) );
673 }
674 
#ifdef DBG_UTIL
// TEST: build a printable line "<type name>:<token text>\n" for one token.
// The token text is left out for TT_EOL tokens.
String SimpleTokenizer_Impl::getFullTokenStr( /*out*/TokenTypes eType,
    /*out*/const sal_Unicode* pStartPos, /*out*/const sal_Unicode* pEndPos )
{
    String aDump;

    // Start with the name of the token type
    switch( eType )
    {
        case TT_UNKNOWN:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_UNKNOWN:") );
            break;
        case TT_IDENTIFIER:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_IDENTIFIER:") );
            break;
        case TT_WHITESPACE:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_WHITESPACE:") );
            break;
        case TT_NUMBER:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_NUMBER:") );
            break;
        case TT_STRING:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_STRING:") );
            break;
        case TT_EOL:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_EOL:") );
            break;
        case TT_COMMENT:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_COMMENT:") );
            break;
        case TT_ERROR:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_ERROR:") );
            break;
        case TT_OPERATOR:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_OPERATOR:") );
            break;
        case TT_KEYWORDS:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_KEYWORD:") );
            break;
        case TT_PARAMETER:
            aDump = String( RTL_CONSTASCII_USTRINGPARAM("TT_PARAMETER:") );
            break;
    }

    // Then the token text itself, except for line ends
    const bool bAppendText = ( eType != TT_EOL );
    if( bAppendText )
    {
        const sal_uInt16 nTokenLen = (sal_uInt16)( pEndPos - pStartPos );
        aDump += String( pStartPos, nTokenLen );
    }

    aDump += String( RTL_CONSTASCII_USTRINGPARAM("\n") );
    return aDump;
}
#endif
703 
SimpleTokenizer_Impl(HighlighterLanguage aLang)704 SimpleTokenizer_Impl::SimpleTokenizer_Impl( HighlighterLanguage aLang ): aLanguage(aLang)
705 {
706     memset( aCharTypeTab, 0, sizeof( aCharTypeTab ) );
707 
708     // Zeichen-Tabelle fuellen
709     sal_uInt16 i;
710 
711     // Zulaessige Zeichen fuer Identifier
712     sal_uInt16 nHelpMask = (sal_uInt16)( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER );
713     for( i = 'a' ; i <= 'z' ; i++ )
714         aCharTypeTab[i] |= nHelpMask;
715     for( i = 'A' ; i <= 'Z' ; i++ )
716         aCharTypeTab[i] |= nHelpMask;
717     // '_' extra eintragen
718     aCharTypeTab[(int)'_'] |= nHelpMask;
719     // AB 23.6.97: '$' ist auch erlaubt
720     aCharTypeTab[(int)'$'] |= nHelpMask;
721 
722     // Ziffern (Identifier und Number ist moeglich)
723     nHelpMask = (sal_uInt16)( CHAR_IN_IDENTIFIER | CHAR_START_NUMBER |
724                          CHAR_IN_NUMBER | CHAR_IN_HEX_NUMBER );
725     for( i = '0' ; i <= '9' ; i++ )
726         aCharTypeTab[i] |= nHelpMask;
727 
728     // e und E sowie . von Hand ergaenzen
729     aCharTypeTab[(int)'e'] |= CHAR_IN_NUMBER;
730     aCharTypeTab[(int)'E'] |= CHAR_IN_NUMBER;
731     aCharTypeTab[(int)'.'] |= (sal_uInt16)( CHAR_IN_NUMBER | CHAR_START_NUMBER );
732     aCharTypeTab[(int)'&'] |= CHAR_START_NUMBER;
733 
734     // Hex-Ziffern
735     for( i = 'a' ; i <= 'f' ; i++ )
736         aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
737     for( i = 'A' ; i <= 'F' ; i++ )
738         aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
739 
740     // Oct-Ziffern
741     for( i = '0' ; i <= '7' ; i++ )
742         aCharTypeTab[i] |= CHAR_IN_OCT_NUMBER;
743 
744     // String-Beginn/End-Zeichen
745     aCharTypeTab[(int)'\''] |= CHAR_START_STRING;
746     aCharTypeTab[(int)'\"'] |= CHAR_START_STRING;
747     aCharTypeTab[(int)'[']  |= CHAR_START_STRING;
748     aCharTypeTab[(int)'`']  |= CHAR_START_STRING;
749 
750     // Operator-Zeichen
751     aCharTypeTab[(int)'!'] |= CHAR_OPERATOR;
752     aCharTypeTab[(int)'%'] |= CHAR_OPERATOR;
753     // aCharTypeTab[(int)'&'] |= CHAR_OPERATOR;     Removed because of #i14140
754     aCharTypeTab[(int)'('] |= CHAR_OPERATOR;
755     aCharTypeTab[(int)')'] |= CHAR_OPERATOR;
756     aCharTypeTab[(int)'*'] |= CHAR_OPERATOR;
757     aCharTypeTab[(int)'+'] |= CHAR_OPERATOR;
758     aCharTypeTab[(int)','] |= CHAR_OPERATOR;
759     aCharTypeTab[(int)'-'] |= CHAR_OPERATOR;
760     aCharTypeTab[(int)'/'] |= CHAR_OPERATOR;
761     aCharTypeTab[(int)':'] |= CHAR_OPERATOR;
762     aCharTypeTab[(int)'<'] |= CHAR_OPERATOR;
763     aCharTypeTab[(int)'='] |= CHAR_OPERATOR;
764     aCharTypeTab[(int)'>'] |= CHAR_OPERATOR;
765     aCharTypeTab[(int)'?'] |= CHAR_OPERATOR;
766     aCharTypeTab[(int)'^'] |= CHAR_OPERATOR;
767     aCharTypeTab[(int)'|'] |= CHAR_OPERATOR;
768     aCharTypeTab[(int)'~'] |= CHAR_OPERATOR;
769     aCharTypeTab[(int)'{'] |= CHAR_OPERATOR;
770     aCharTypeTab[(int)'}'] |= CHAR_OPERATOR;
771     // aCharTypeTab[(int)'['] |= CHAR_OPERATOR;     Removed because of #i17826
772     aCharTypeTab[(int)']'] |= CHAR_OPERATOR;
773     aCharTypeTab[(int)';'] |= CHAR_OPERATOR;
774 
775     // Space
776     aCharTypeTab[(int)' ' ] |= CHAR_SPACE;
777     aCharTypeTab[(int)'\t'] |= CHAR_SPACE;
778 
779     // Zeilen-Ende-Zeichen
780     aCharTypeTab[(int)'\r'] |= CHAR_EOL;
781     aCharTypeTab[(int)'\n'] |= CHAR_EOL;
782 
783     ppListKeyWords = NULL;
784 }
785 
~SimpleTokenizer_Impl(void)786 SimpleTokenizer_Impl::~SimpleTokenizer_Impl( void )
787 {
788 }
789 
getSimpleTokenizer(void)790 SimpleTokenizer_Impl* getSimpleTokenizer( void )
791 {
792     static SimpleTokenizer_Impl* pSimpleTokenizer = NULL;
793     if( !pSimpleTokenizer )
794         pSimpleTokenizer = new SimpleTokenizer_Impl();
795     return pSimpleTokenizer;
796 }
797 
798 // Heraussuchen der jeweils naechsten Funktion aus einem JavaScript-Modul
parseLine(sal_uInt32 nParseLine,const String * aSource)799 sal_uInt16 SimpleTokenizer_Impl::parseLine( sal_uInt32 nParseLine, const String* aSource )
800 {
801     // Position auf den Anfang des Source-Strings setzen
802     mpStringBegin = mpActualPos = aSource->GetBuffer();
803 
804     // Zeile und Spalte initialisieren
805     nLine = nParseLine;
806     nCol = 0L;
807 
808     // Variablen fuer die Out-Parameter
809     TokenTypes eType;
810     const sal_Unicode* pStartPos;
811     const sal_Unicode* pEndPos;
812 
813     // Schleife ueber alle Tokens
814     sal_uInt16 nTokenCount = 0;
815     while( getNextToken( eType, pStartPos, pEndPos ) )
816         nTokenCount++;
817 
818     return nTokenCount;
819 }
820 
getHighlightPortions(sal_uInt32 nParseLine,const String & rLine,HighlightPortions & portions)821 void SimpleTokenizer_Impl::getHighlightPortions( sal_uInt32 nParseLine, const String& rLine,
822                                                     /*out*/HighlightPortions& portions  )
823 {
824     // Position auf den Anfang des Source-Strings setzen
825     mpStringBegin = mpActualPos = rLine.GetBuffer();
826 
827     // Zeile und Spalte initialisieren
828     nLine = nParseLine;
829     nCol = 0L;
830 
831     // Variablen fuer die Out-Parameter
832     TokenTypes eType;
833     const sal_Unicode* pStartPos;
834     const sal_Unicode* pEndPos;
835 
836     // Schleife ueber alle Tokens
837     while( getNextToken( eType, pStartPos, pEndPos ) )
838     {
839         HighlightPortion portion;
840 
841         portion.nBegin = (sal_uInt16)(pStartPos - mpStringBegin);
842         portion.nEnd = (sal_uInt16)(pEndPos - mpStringBegin);
843         portion.tokenType = eType;
844 
845         portions.push_back(portion);
846     }
847 }
848 
849 
850 //////////////////////////////////////////////////////////////////////////
// Implementation of the SyntaxHighlighter
852 
SyntaxHighlighter()853 SyntaxHighlighter::SyntaxHighlighter()
854 {
855     m_pSimpleTokenizer = 0;
856     m_pKeyWords = NULL;
857     m_nKeyWordCount = 0;
858 }
859 
~SyntaxHighlighter()860 SyntaxHighlighter::~SyntaxHighlighter()
861 {
862     delete m_pSimpleTokenizer;
863     delete m_pKeyWords;
864 }
865 
initialize(HighlighterLanguage eLanguage_)866 void SyntaxHighlighter::initialize( HighlighterLanguage eLanguage_ )
867 {
868     eLanguage = eLanguage_;
869     delete m_pSimpleTokenizer;
870     m_pSimpleTokenizer = new SimpleTokenizer_Impl(eLanguage);
871 
872     switch (eLanguage)
873     {
874         case HIGHLIGHT_BASIC:
875             m_pSimpleTokenizer->setKeyWords( strListBasicKeyWords,
876                                             sizeof( strListBasicKeyWords ) / sizeof( char* ));
877             break;
878         case HIGHLIGHT_SQL:
879             m_pSimpleTokenizer->setKeyWords( strListSqlKeyWords,
880                                             sizeof( strListSqlKeyWords ) / sizeof( char* ));
881             break;
882         default:
883             m_pSimpleTokenizer->setKeyWords( NULL, 0 );
884     }
885 }
886 
notifyChange(sal_uInt32 nLine,sal_Int32 nLineCountDifference,const String * pChangedLines,sal_uInt32 nArrayLength)887 const Range SyntaxHighlighter::notifyChange( sal_uInt32 nLine, sal_Int32 nLineCountDifference,
888                                 const String* pChangedLines, sal_uInt32 nArrayLength)
889 {
890     (void)nLineCountDifference;
891 
892     for( sal_uInt32 i=0 ; i < nArrayLength ; i++ )
893         m_pSimpleTokenizer->parseLine(nLine+i, &pChangedLines[i]);
894 
895     return Range( nLine, nLine + nArrayLength-1 );
896 }
897 
getHighlightPortions(sal_uInt32 nLine,const String & rLine,HighlightPortions & portions)898 void SyntaxHighlighter::getHighlightPortions( sal_uInt32 nLine, const String& rLine,
899                                             /*out*/HighlightPortions& portions )
900 {
901     m_pSimpleTokenizer->getHighlightPortions( nLine, rLine, portions );
902 }
903