xref: /AOO41X/main/i18npool/source/characterclassification/cclass_unicode_parser.cxx (revision 449ab281255486d6ec349c45a6ad7906d6939331)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_i18npool.hxx"
26 
27 #include <cclass_unicode.hxx>
28 #include <unicode/uchar.h>
29 #include <rtl/math.hxx>
30 #include <rtl/ustring.hxx>
31 #include <com/sun/star/i18n/KParseTokens.hpp>
32 #include <com/sun/star/i18n/KParseType.hpp>
33 #include <com/sun/star/i18n/UnicodeType.hpp>
34 #include <com/sun/star/i18n/XLocaleData.hpp>
35 #include <com/sun/star/i18n/NativeNumberMode.hpp>
36 
37 #include <string.h>     // memcpy()
38 
39 using namespace ::com::sun::star::uno;
40 using namespace ::com::sun::star::lang;
41 using namespace ::rtl;
42 
43 namespace com { namespace sun { namespace star { namespace i18n {
44 
// Bit flags stored per character in the parser tables (pDefaultParserTable,
// pTable, pStart, pCont) and returned by getFlags()/getFlagsExtended().
// Each non-zero constant is a distinct single bit so entries can be OR-ed.
//
// Usage visible in this file:
//  - TOKEN_CHAR_WORD marks a character allowed as the start of a word,
//    TOKEN_WORD one allowed as a continuation character (initParserTable()).
//  - TOKEN_VALUE is added for the locale's group separator, and
//    TOKEN_CHAR_VALUE | TOKEN_VALUE for the decimal separator.
//  - TOKEN_EXCLUDED makes parseText() switch to the ssBounce state.
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_ILLEGAL       = 0x00000000;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR          = 0x00000001;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_BOOL = 0x00000002;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_WORD = 0x00000004;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_VALUE    = 0x00000008;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_STRING   = 0x00000010;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_CHAR_DONTCARE= 0x00000020;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_BOOL          = 0x00000040;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_WORD          = 0x00000080;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_WORD_SEP      = 0x00000100;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE     = 0x00000200;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_SEP = 0x00000400;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_EXP = 0x00000800;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_SIGN    = 0x00001000;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_EXP_VALUE   = 0x00002000;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_VALUE_DIGIT   = 0x00004000;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_NAME_SEP      = 0x20000000;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_STRING_SEP    = 0x40000000;
const UPT_FLAG_TYPE cclass_Unicode::TOKEN_EXCLUDED      = 0x80000000;

// Flag combination shared by all digit entries: used for the ASCII digits in
// pDefaultParserTable and OR-ed in for Unicode number categories in
// getFlagsExtended().
#define TOKEN_DIGIT_FLAGS (TOKEN_CHAR_VALUE | TOKEN_VALUE | TOKEN_VALUE_EXP | TOKEN_VALUE_EXP_VALUE | TOKEN_VALUE_DIGIT)
66 
// Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*
//
// pDefaultParserTable holds one UPT_FLAG_TYPE entry for every character code
// in the range 0 .. nDefCnt-1. initParserTable() copies it into pTable and
// then adjusts the copy according to locale separators, the requested
// KParseTokens types and the user defined characters. Characters with a code
// >= nDefCnt are not covered here; getFlags() passes them to
// getFlagsExtended() instead.

// Number of entries in the per-character tables below.
const sal_uInt8 cclass_Unicode::nDefCnt = 128;
const UPT_FLAG_TYPE cclass_Unicode::pDefaultParserTable[ nDefCnt ] =
{
// (...) == Calc formula compiler specific, commented out and modified

    /* \0 */    TOKEN_EXCLUDED,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
    /*  9 \t */ TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,     // (TOKEN_ILLEGAL)
                TOKEN_ILLEGAL,
    /* 11 \v */ TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,     // (TOKEN_ILLEGAL)
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
                TOKEN_ILLEGAL,
    /*  32   */ TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  33 ! */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  34 " */ TOKEN_CHAR_STRING | TOKEN_STRING_SEP,
    /*  35 # */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_WORD_SEP)
    /*  36 $ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_CHAR_WORD | TOKEN_WORD)
    /*  37 % */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_VALUE)
    /*  38 & */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  39 ' */ TOKEN_NAME_SEP,
    /*  40 ( */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  41 ) */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  42 * */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  43 + */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP | TOKEN_VALUE_EXP | TOKEN_VALUE_SIGN,
    /*  44 , */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_CHAR_VALUE | TOKEN_VALUE)
    /*  45 - */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP | TOKEN_VALUE_EXP | TOKEN_VALUE_SIGN,
    /*  46 . */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_WORD | TOKEN_CHAR_VALUE | TOKEN_VALUE)
    /*  47 / */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    //for ( i = 48; i < 58; i++ )
    /*  48 0 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  49 1 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  50 2 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  51 3 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  52 4 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  53 5 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  54 6 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  55 7 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  56 8 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  57 9 */ TOKEN_DIGIT_FLAGS | TOKEN_WORD,
    /*  58 : */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_WORD)
    /*  59 ; */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  60 < */ TOKEN_CHAR_BOOL | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  61 = */ TOKEN_CHAR | TOKEN_BOOL | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  62 > */ TOKEN_CHAR_BOOL | TOKEN_BOOL | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  63 ? */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_CHAR_WORD | TOKEN_WORD)
    /*  64 @ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
    //for ( i = 65; i < 91; i++ )
    /*  65 A */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  66 B */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  67 C */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  68 D */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  69 E */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  70 F */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  71 G */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  72 H */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  73 I */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  74 J */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  75 K */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  76 L */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  77 M */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  78 N */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  79 O */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  80 P */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  81 Q */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  82 R */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  83 S */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  84 T */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  85 U */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  86 V */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  87 W */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  88 X */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  89 Y */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  90 Z */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  91 [ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
    /*  92 \ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
    /*  93 ] */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
    /*  94 ^ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,
    /*  95 _ */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  96 ` */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
    //for ( i = 97; i < 123; i++ )
    /*  97 a */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  98 b */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /*  99 c */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 100 d */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 101 e */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 102 f */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 103 g */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 104 h */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 105 i */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 106 j */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 107 k */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 108 l */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 109 m */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 110 n */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 111 o */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 112 p */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 113 q */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 114 r */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 115 s */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 116 t */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 117 u */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 118 v */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 119 w */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 120 x */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 121 y */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 122 z */ TOKEN_CHAR_WORD | TOKEN_WORD,
    /* 123 { */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
    /* 124 | */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
    /* 125 } */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
    /* 126 ~ */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP,  // (TOKEN_ILLEGAL // UNUSED)
    /* 127   */ TOKEN_CHAR | TOKEN_WORD_SEP | TOKEN_VALUE_SEP   // (TOKEN_ILLEGAL // UNUSED)
};
206 
207 
// KParseTokens::ASC_... classification for every character code in the range
// 0 .. nDefCnt-1. getParseTokensType() returns the entry indexed by the
// character code; characters >= nDefCnt are classified there via
// u_charType() instead of this table.
const sal_Int32 cclass_Unicode::pParseTokensType[ nDefCnt ] =
{
    /* \0 */    KParseTokens::ASC_OTHER,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
    /*  9 \t */ KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
    /* 11 \v */ KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
    /*  32   */ KParseTokens::ASC_OTHER,
    /*  33 ! */ KParseTokens::ASC_OTHER,
    /*  34 " */ KParseTokens::ASC_OTHER,
    /*  35 # */ KParseTokens::ASC_OTHER,
    /*  36 $ */ KParseTokens::ASC_DOLLAR,
    /*  37 % */ KParseTokens::ASC_OTHER,
    /*  38 & */ KParseTokens::ASC_OTHER,
    /*  39 ' */ KParseTokens::ASC_OTHER,
    /*  40 ( */ KParseTokens::ASC_OTHER,
    /*  41 ) */ KParseTokens::ASC_OTHER,
    /*  42 * */ KParseTokens::ASC_OTHER,
    /*  43 + */ KParseTokens::ASC_OTHER,
    /*  44 , */ KParseTokens::ASC_OTHER,
    /*  45 - */ KParseTokens::ASC_OTHER,
    /*  46 . */ KParseTokens::ASC_DOT,
    /*  47 / */ KParseTokens::ASC_OTHER,
    //for ( i = 48; i < 58; i++ )
    /*  48 0 */ KParseTokens::ASC_DIGIT,
    /*  49 1 */ KParseTokens::ASC_DIGIT,
    /*  50 2 */ KParseTokens::ASC_DIGIT,
    /*  51 3 */ KParseTokens::ASC_DIGIT,
    /*  52 4 */ KParseTokens::ASC_DIGIT,
    /*  53 5 */ KParseTokens::ASC_DIGIT,
    /*  54 6 */ KParseTokens::ASC_DIGIT,
    /*  55 7 */ KParseTokens::ASC_DIGIT,
    /*  56 8 */ KParseTokens::ASC_DIGIT,
    /*  57 9 */ KParseTokens::ASC_DIGIT,
    /*  58 : */ KParseTokens::ASC_COLON,
    /*  59 ; */ KParseTokens::ASC_OTHER,
    /*  60 < */ KParseTokens::ASC_OTHER,
    /*  61 = */ KParseTokens::ASC_OTHER,
    /*  62 > */ KParseTokens::ASC_OTHER,
    /*  63 ? */ KParseTokens::ASC_OTHER,
    /*  64 @ */ KParseTokens::ASC_OTHER,
    //for ( i = 65; i < 91; i++ )
    /*  65 A */ KParseTokens::ASC_UPALPHA,
    /*  66 B */ KParseTokens::ASC_UPALPHA,
    /*  67 C */ KParseTokens::ASC_UPALPHA,
    /*  68 D */ KParseTokens::ASC_UPALPHA,
    /*  69 E */ KParseTokens::ASC_UPALPHA,
    /*  70 F */ KParseTokens::ASC_UPALPHA,
    /*  71 G */ KParseTokens::ASC_UPALPHA,
    /*  72 H */ KParseTokens::ASC_UPALPHA,
    /*  73 I */ KParseTokens::ASC_UPALPHA,
    /*  74 J */ KParseTokens::ASC_UPALPHA,
    /*  75 K */ KParseTokens::ASC_UPALPHA,
    /*  76 L */ KParseTokens::ASC_UPALPHA,
    /*  77 M */ KParseTokens::ASC_UPALPHA,
    /*  78 N */ KParseTokens::ASC_UPALPHA,
    /*  79 O */ KParseTokens::ASC_UPALPHA,
    /*  80 P */ KParseTokens::ASC_UPALPHA,
    /*  81 Q */ KParseTokens::ASC_UPALPHA,
    /*  82 R */ KParseTokens::ASC_UPALPHA,
    /*  83 S */ KParseTokens::ASC_UPALPHA,
    /*  84 T */ KParseTokens::ASC_UPALPHA,
    /*  85 U */ KParseTokens::ASC_UPALPHA,
    /*  86 V */ KParseTokens::ASC_UPALPHA,
    /*  87 W */ KParseTokens::ASC_UPALPHA,
    /*  88 X */ KParseTokens::ASC_UPALPHA,
    /*  89 Y */ KParseTokens::ASC_UPALPHA,
    /*  90 Z */ KParseTokens::ASC_UPALPHA,
    /*  91 [ */ KParseTokens::ASC_OTHER,
    /*  92 \ */ KParseTokens::ASC_OTHER,
    /*  93 ] */ KParseTokens::ASC_OTHER,
    /*  94 ^ */ KParseTokens::ASC_OTHER,
    /*  95 _ */ KParseTokens::ASC_UNDERSCORE,
    /*  96 ` */ KParseTokens::ASC_OTHER,
    //for ( i = 97; i < 123; i++ )
    /*  97 a */ KParseTokens::ASC_LOALPHA,
    /*  98 b */ KParseTokens::ASC_LOALPHA,
    /*  99 c */ KParseTokens::ASC_LOALPHA,
    /* 100 d */ KParseTokens::ASC_LOALPHA,
    /* 101 e */ KParseTokens::ASC_LOALPHA,
    /* 102 f */ KParseTokens::ASC_LOALPHA,
    /* 103 g */ KParseTokens::ASC_LOALPHA,
    /* 104 h */ KParseTokens::ASC_LOALPHA,
    /* 105 i */ KParseTokens::ASC_LOALPHA,
    /* 106 j */ KParseTokens::ASC_LOALPHA,
    /* 107 k */ KParseTokens::ASC_LOALPHA,
    /* 108 l */ KParseTokens::ASC_LOALPHA,
    /* 109 m */ KParseTokens::ASC_LOALPHA,
    /* 110 n */ KParseTokens::ASC_LOALPHA,
    /* 111 o */ KParseTokens::ASC_LOALPHA,
    /* 112 p */ KParseTokens::ASC_LOALPHA,
    /* 113 q */ KParseTokens::ASC_LOALPHA,
    /* 114 r */ KParseTokens::ASC_LOALPHA,
    /* 115 s */ KParseTokens::ASC_LOALPHA,
    /* 116 t */ KParseTokens::ASC_LOALPHA,
    /* 117 u */ KParseTokens::ASC_LOALPHA,
    /* 118 v */ KParseTokens::ASC_LOALPHA,
    /* 119 w */ KParseTokens::ASC_LOALPHA,
    /* 120 x */ KParseTokens::ASC_LOALPHA,
    /* 121 y */ KParseTokens::ASC_LOALPHA,
    /* 122 z */ KParseTokens::ASC_LOALPHA,
    /* 123 { */ KParseTokens::ASC_OTHER,
    /* 124 | */ KParseTokens::ASC_OTHER,
    /* 125 } */ KParseTokens::ASC_OTHER,
    /* 126 ~ */ KParseTokens::ASC_OTHER,
    /* 127   */ KParseTokens::ASC_OTHER
};
342 
343 
344 // static
/** Find the first occurrence of a character in a null-terminated string.

    @param pStr  String to search; may be NULL.
    @param c     Character to look for. The terminating null itself is never
                 matched, so searching for 0 yields NULL.
    @return      Pointer to the first matching character inside pStr, or NULL
                 if pStr is NULL or c does not occur.
 */
const sal_Unicode* cclass_Unicode::StrChr( const sal_Unicode* pStr, sal_Unicode c )
{
    if ( pStr )
    {
        for ( const sal_Unicode* pCur = pStr; *pCur; ++pCur )
        {
            if ( *pCur == c )
                return pCur;
        }
    }
    return NULL;
}
357 
358 
getParseTokensType(const sal_Unicode * aStr,sal_Int32 nPos)359 sal_Int32 cclass_Unicode::getParseTokensType( const sal_Unicode* aStr, sal_Int32 nPos )
360 {
361     sal_Unicode c = aStr[nPos];
362     if ( c < nDefCnt )
363         return pParseTokensType[ sal_uInt8(c) ];
364     else
365     {
366 
367         //! all KParseTokens::UNI_... must be matched
368         switch ( u_charType( (sal_uInt32) c ) )
369         {
370             case U_UPPERCASE_LETTER :
371                 return KParseTokens::UNI_UPALPHA;
372             case U_LOWERCASE_LETTER :
373                 return KParseTokens::UNI_LOALPHA;
374             case U_TITLECASE_LETTER :
375                 return KParseTokens::UNI_TITLE_ALPHA;
376             case U_MODIFIER_LETTER :
377                 return KParseTokens::UNI_MODIFIER_LETTER;
378             case U_OTHER_LETTER :
379                 // Non_Spacing_Mark could not be as leading character
380                 if (nPos == 0) break;
381                 // fall through, treat it as Other_Letter.
382             case U_NON_SPACING_MARK :
383                 return KParseTokens::UNI_OTHER_LETTER;
384             case U_DECIMAL_DIGIT_NUMBER :
385                 return KParseTokens::UNI_DIGIT;
386             case U_LETTER_NUMBER :
387                 return KParseTokens::UNI_LETTER_NUMBER;
388             case U_OTHER_NUMBER :
389                 return KParseTokens::UNI_OTHER_NUMBER;
390         }
391 
392         return KParseTokens::UNI_OTHER;
393     }
394 }
395 
setupInternational(const Locale & rLocale)396 sal_Bool cclass_Unicode::setupInternational( const Locale& rLocale )
397 {
398     sal_Bool bChanged = (aParserLocale.Language != rLocale.Language
399         || aParserLocale.Country != rLocale.Country
400         || aParserLocale.Variant != rLocale.Variant);
401     if ( bChanged )
402     {
403         aParserLocale.Language = rLocale.Language;
404         aParserLocale.Country = rLocale.Country;
405         aParserLocale.Variant = rLocale.Variant;
406     }
407     if ( !xLocaleData.is() && xMSF.is() )
408     {
409         Reference <
410             XInterface > xI =
411             xMSF->createInstance( OUString(
412             RTL_CONSTASCII_USTRINGPARAM( "com.sun.star.i18n.LocaleData" ) ) );
413         if ( xI.is() )
414         {
415             Any x = xI->queryInterface( getCppuType((const Reference< XLocaleData>*)0) );
416             x >>= xLocaleData;
417         }
418     }
419     return bChanged;
420 }
421 
422 
setupParserTable(const Locale & rLocale,sal_Int32 startCharTokenType,const OUString & userDefinedCharactersStart,sal_Int32 contCharTokenType,const OUString & userDefinedCharactersCont)423 void cclass_Unicode::setupParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
424             const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
425             const OUString& userDefinedCharactersCont )
426 {
427     bool bIntlEqual = (rLocale.Language == aParserLocale.Language &&
428         rLocale.Country == aParserLocale.Country &&
429         rLocale.Variant == aParserLocale.Variant);
430     if ( !pTable || !bIntlEqual ||
431             startCharTokenType != nStartTypes ||
432             contCharTokenType != nContTypes ||
433             userDefinedCharactersStart != aStartChars ||
434             userDefinedCharactersCont != aContChars )
435         initParserTable( rLocale, startCharTokenType, userDefinedCharactersStart,
436             contCharTokenType, userDefinedCharactersCont );
437 }
438 
439 
initParserTable(const Locale & rLocale,sal_Int32 startCharTokenType,const OUString & userDefinedCharactersStart,sal_Int32 contCharTokenType,const OUString & userDefinedCharactersCont)440 void cclass_Unicode::initParserTable( const Locale& rLocale, sal_Int32 startCharTokenType,
441             const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType,
442             const OUString& userDefinedCharactersCont )
443 {
444     // (Re)Init
445     setupInternational( rLocale );
446     // Memory of pTable is reused.
447     if ( !pTable )
448         pTable = new UPT_FLAG_TYPE[nDefCnt];
449     memcpy( pTable, pDefaultParserTable, sizeof(UPT_FLAG_TYPE) * nDefCnt );
450     // Start and cont tables only need reallocation if different length.
451     if ( pStart && userDefinedCharactersStart.getLength() != aStartChars.getLength() )
452     {
453         delete [] pStart;
454         pStart = NULL;
455     }
456     if ( pCont && userDefinedCharactersCont.getLength() != aContChars.getLength() )
457     {
458         delete [] pCont;
459         pCont = NULL;
460     }
461     nStartTypes = startCharTokenType;
462     nContTypes = contCharTokenType;
463     aStartChars = userDefinedCharactersStart;
464     aContChars = userDefinedCharactersCont;
465 
466     // specials
467     if( xLocaleData.is() )
468     {
469         LocaleDataItem aItem =
470             xLocaleData->getLocaleItem( aParserLocale );
471 //!TODO: theoretically separators may be a string, adjustment would have to be
472 //! done here and in parsing and in ::rtl::math::stringToDouble()
473         cGroupSep = aItem.thousandSeparator.getStr()[0];
474         cDecimalSep = aItem.decimalSeparator.getStr()[0];
475     }
476 
477     if ( cGroupSep < nDefCnt )
478         pTable[cGroupSep] |= TOKEN_VALUE;
479     if ( cDecimalSep < nDefCnt )
480         pTable[cDecimalSep] |= TOKEN_CHAR_VALUE | TOKEN_VALUE;
481 
482     // Modify characters according to KParseTokens definitions.
483     {
484         using namespace KParseTokens;
485         sal_uInt8 i;
486 
487         if ( !(nStartTypes & ASC_UPALPHA) )
488             for ( i = 65; i < 91; i++ )
489                 pTable[i] &= ~TOKEN_CHAR_WORD;  // not allowed as start character
490         if ( !(nContTypes & ASC_UPALPHA) )
491             for ( i = 65; i < 91; i++ )
492                 pTable[i] &= ~TOKEN_WORD;       // not allowed as cont character
493 
494         if ( !(nStartTypes & ASC_LOALPHA) )
495             for ( i = 97; i < 123; i++ )
496                 pTable[i] &= ~TOKEN_CHAR_WORD;  // not allowed as start character
497         if ( !(nContTypes & ASC_LOALPHA) )
498             for ( i = 97; i < 123; i++ )
499                 pTable[i] &= ~TOKEN_WORD;       // not allowed as cont character
500 
501         if ( nStartTypes & ASC_DIGIT )
502             for ( i = 48; i < 58; i++ )
503                 pTable[i] |= TOKEN_CHAR_WORD;   // allowed as start character
504         if ( !(nContTypes & ASC_DIGIT) )
505             for ( i = 48; i < 58; i++ )
506                 pTable[i] &= ~TOKEN_WORD;       // not allowed as cont character
507 
508         if ( !(nStartTypes & ASC_UNDERSCORE) )
509             pTable[95] &= ~TOKEN_CHAR_WORD;     // not allowed as start character
510         if ( !(nContTypes & ASC_UNDERSCORE) )
511             pTable[95] &= ~TOKEN_WORD;          // not allowed as cont character
512 
513         if ( nStartTypes & ASC_DOLLAR )
514             pTable[36] |= TOKEN_CHAR_WORD;      // allowed as start character
515         if ( nContTypes & ASC_DOLLAR )
516             pTable[36] |= TOKEN_WORD;           // allowed as cont character
517 
518         if ( nStartTypes & ASC_DOT )
519             pTable[46] |= TOKEN_CHAR_WORD;      // allowed as start character
520         if ( nContTypes & ASC_DOT )
521             pTable[46] |= TOKEN_WORD;           // allowed as cont character
522 
523         if ( nStartTypes & ASC_COLON )
524             pTable[58] |= TOKEN_CHAR_WORD;      // allowed as start character
525         if ( nContTypes & ASC_COLON )
526             pTable[58] |= TOKEN_WORD;           // allowed as cont character
527 
528         if ( nStartTypes & ASC_CONTROL )
529             for ( i = 1; i < 32; i++ )
530                 pTable[i] |= TOKEN_CHAR_WORD;   // allowed as start character
531         if ( nContTypes & ASC_CONTROL )
532             for ( i = 1; i < 32; i++ )
533                 pTable[i] |= TOKEN_WORD;        // allowed as cont character
534 
535         if ( nStartTypes & ASC_ANY_BUT_CONTROL )
536             for ( i = 32; i < nDefCnt; i++ )
537                 pTable[i] |= TOKEN_CHAR_WORD;   // allowed as start character
538         if ( nContTypes & ASC_ANY_BUT_CONTROL )
539             for ( i = 32; i < nDefCnt; i++ )
540                 pTable[i] |= TOKEN_WORD;        // allowed as cont character
541 
542     }
543 
544     // Merge in (positively override with) user defined characters.
545     // StartChars
546     sal_Int32 nLen = aStartChars.getLength();
547     if ( nLen )
548     {
549         if ( !pStart )
550             pStart = new UPT_FLAG_TYPE[ nLen ];
551         const sal_Unicode* p = aStartChars.getStr();
552         for ( sal_Int32 j=0; j<nLen; j++, p++ )
553         {
554             pStart[j] = TOKEN_CHAR_WORD;
555             if ( *p < nDefCnt )
556                 pTable[*p] |= TOKEN_CHAR_WORD;
557         }
558     }
559     // ContChars
560     nLen = aContChars.getLength();
561     if ( nLen )
562     {
563         if ( !pCont )
564             pCont = new UPT_FLAG_TYPE[ nLen ];
565         const sal_Unicode* p = aContChars.getStr();
566         for ( sal_Int32 j=0; j<nLen; j++ )
567         {
568             pCont[j] = TOKEN_WORD;
569             if ( *p < nDefCnt )
570                 pTable[*p] |= TOKEN_WORD;
571         }
572     }
573 }
574 
575 
destroyParserTable()576 void cclass_Unicode::destroyParserTable()
577 {
578     if ( pCont )
579         delete [] pCont;
580     if ( pStart )
581         delete [] pStart;
582     if ( pTable )
583         delete [] pTable;
584 }
585 
586 
getFlags(const sal_Unicode * aStr,sal_Int32 nPos)587 UPT_FLAG_TYPE cclass_Unicode::getFlags( const sal_Unicode* aStr, sal_Int32 nPos )
588 {
589     UPT_FLAG_TYPE nMask;
590     sal_Unicode c = aStr[nPos];
591     if ( c < nDefCnt )
592         nMask = pTable[ sal_uInt8(c) ];
593     else
594         nMask = getFlagsExtended( aStr, nPos );
595     switch ( eState )
596     {
597         case ssGetChar :
598         case ssRewindFromValue :
599         case ssIgnoreLeadingInRewind :
600         case ssGetWordFirstChar :
601             if ( !(nMask & TOKEN_CHAR_WORD) )
602             {
603                 nMask |= getStartCharsFlags( c );
604                 if ( nMask & TOKEN_CHAR_WORD )
605                     nMask &= ~TOKEN_EXCLUDED;
606             }
607         break;
608         case ssGetValue :
609         case ssGetWord :
610             if ( !(nMask & TOKEN_WORD) )
611             {
612                 nMask |= getContCharsFlags( c );
613                 if ( nMask & TOKEN_WORD )
614                     nMask &= ~TOKEN_EXCLUDED;
615             }
616         break;
617         default:
618             ;   // other cases aren't needed, no compiler warning
619     }
620     return nMask;
621 }
622 
623 
getFlagsExtended(const sal_Unicode * aStr,sal_Int32 nPos)624 UPT_FLAG_TYPE cclass_Unicode::getFlagsExtended( const sal_Unicode* aStr, sal_Int32 nPos )
625 {
626     sal_Unicode c = aStr[nPos];
627     if ( c == cGroupSep )
628         return TOKEN_VALUE;
629     else if ( c == cDecimalSep )
630         return TOKEN_CHAR_VALUE | TOKEN_VALUE;
631     using namespace i18n;
632     bool bStart = (eState == ssGetChar || eState == ssGetWordFirstChar ||
633             eState == ssRewindFromValue || eState == ssIgnoreLeadingInRewind);
634     sal_Int32 nTypes = (bStart ? nStartTypes : nContTypes);
635 
636     //! all KParseTokens::UNI_... must be matched
637     switch ( u_charType( (sal_uInt32) c ) )
638     {
639         case U_UPPERCASE_LETTER :
640             return (nTypes & KParseTokens::UNI_UPALPHA) ?
641                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
642                 TOKEN_ILLEGAL;
643         case U_LOWERCASE_LETTER :
644             return (nTypes & KParseTokens::UNI_LOALPHA) ?
645                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
646                 TOKEN_ILLEGAL;
647         case U_TITLECASE_LETTER :
648             return (nTypes & KParseTokens::UNI_TITLE_ALPHA) ?
649                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
650                 TOKEN_ILLEGAL;
651         case U_MODIFIER_LETTER :
652             return (nTypes & KParseTokens::UNI_MODIFIER_LETTER) ?
653                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
654                 TOKEN_ILLEGAL;
655         case U_NON_SPACING_MARK :
656         case U_COMBINING_SPACING_MARK :
657             // Non_Spacing_Mark can't be a leading character,
658             // nor can a spacing combining mark.
659             if (bStart)
660                 return TOKEN_ILLEGAL;
661             // fall through, treat it as Other_Letter.
662         case U_OTHER_LETTER :
663             return (nTypes & KParseTokens::UNI_OTHER_LETTER) ?
664                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
665                 TOKEN_ILLEGAL;
666         case U_DECIMAL_DIGIT_NUMBER :
667             return ((nTypes & KParseTokens::UNI_DIGIT) ?
668                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
669                 TOKEN_ILLEGAL) | TOKEN_DIGIT_FLAGS;
670         case U_LETTER_NUMBER :
671             return ((nTypes & KParseTokens::UNI_LETTER_NUMBER) ?
672                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
673                 TOKEN_ILLEGAL) | TOKEN_DIGIT_FLAGS;
674         case U_OTHER_NUMBER :
675             return ((nTypes & KParseTokens::UNI_OTHER_NUMBER) ?
676                 (bStart ? TOKEN_CHAR_WORD : TOKEN_WORD) :
677                 TOKEN_ILLEGAL) | TOKEN_DIGIT_FLAGS;
678         case U_SPACE_SEPARATOR :
679             return ((nTypes & KParseTokens::IGNORE_LEADING_WS) ?
680                 TOKEN_CHAR_DONTCARE : (bStart ? TOKEN_CHAR_WORD : (TOKEN_CHAR_DONTCARE | TOKEN_WORD_SEP | TOKEN_VALUE_SEP) ));
681     }
682 
683     return TOKEN_ILLEGAL;
684 }
685 
686 
getStartCharsFlags(sal_Unicode c)687 UPT_FLAG_TYPE cclass_Unicode::getStartCharsFlags( sal_Unicode c )
688 {
689     if ( pStart )
690     {
691         const sal_Unicode* pStr = aStartChars.getStr();
692         const sal_Unicode* p = StrChr( pStr, c );
693         if ( p )
694             return pStart[ p - pStr ];
695     }
696     return TOKEN_ILLEGAL;
697 }
698 
699 
getContCharsFlags(sal_Unicode c)700 UPT_FLAG_TYPE cclass_Unicode::getContCharsFlags( sal_Unicode c )
701 {
702     if ( pCont )
703     {
704         const sal_Unicode* pStr = aContChars.getStr();
705         const sal_Unicode* p = StrChr( pStr, c );
706         if ( p )
707             return pCont[ p - pStr ];
708     }
709     return TOKEN_ILLEGAL;
710 }
711 
712 
parseText(ParseResult & r,const OUString & rText,sal_Int32 nPos,sal_Int32 nTokenType)713 void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 nPos, sal_Int32 nTokenType )
714 {
715     using namespace i18n;
716     const sal_Unicode* const pTextStart = rText.getStr() + nPos;
717     eState = ssGetChar;
718 
719     //! All the variables below (plus ParseResult) have to be resetted on ssRewindFromValue!
720     const sal_Unicode* pSym = pTextStart;
721     const sal_Unicode* pSrc = pSym;
722     OUString aSymbol;
723     sal_Unicode c = *pSrc;
724     sal_Unicode cLast = 0;
725     int nDecSeps = 0;
726     bool bQuote = false;
727     bool bMightBeWord = true;
728     bool bMightBeWordLast = true;
729     //! All the variables above (plus ParseResult) have to be resetted on ssRewindFromValue!
730 
731     while ( (c != 0) && (eState != ssStop) )
732     {
733         UPT_FLAG_TYPE nMask = getFlags( pTextStart, pSrc - pTextStart );
734         if ( nMask & TOKEN_EXCLUDED )
735             eState = ssBounce;
736         if ( bMightBeWord )
737         {   // only relevant for ssGetValue fall back
738             if ( eState == ssGetChar || eState == ssRewindFromValue ||
739                     eState == ssIgnoreLeadingInRewind )
740                 bMightBeWord = ((nMask & TOKEN_CHAR_WORD) != 0);
741             else
742                 bMightBeWord = ((nMask & TOKEN_WORD) != 0);
743         }
744         sal_Int32 nParseTokensType = getParseTokensType( pTextStart, pSrc - pTextStart );
745         pSrc++;
746         switch (eState)
747         {
748             case ssGetChar :
749             case ssRewindFromValue :
750             case ssIgnoreLeadingInRewind :
751             {
752                 if ( (nMask & TOKEN_CHAR_VALUE) && eState != ssRewindFromValue
753                         && eState != ssIgnoreLeadingInRewind )
754                 {   //! must be first, may fall back to ssGetWord via bMightBeWord
755                     eState = ssGetValue;
756                     if ( nMask & TOKEN_VALUE_DIGIT )
757                     {
758                         if ( 128 <= c )
759                             r.TokenType = KParseType::UNI_NUMBER;
760                         else
761                             r.TokenType = KParseType::ASC_NUMBER;
762                     }
763                     else if ( c == cDecimalSep )
764                     {
765                         if ( *pSrc )
766                             ++nDecSeps;
767                         else
768                             eState = ssRewindFromValue;
769                             // retry for ONE_SINGLE_CHAR or others
770                     }
771                 }
772                 else if ( nMask & TOKEN_CHAR_WORD )
773                 {
774                     eState = ssGetWord;
775                     r.TokenType = KParseType::IDENTNAME;
776                 }
777                 else if ( nMask & TOKEN_NAME_SEP )
778                 {
779                     eState = ssGetWordFirstChar;
780                     bQuote = true;
781                     pSym++;
782                     nParseTokensType = 0;   // will be taken of first real character
783                     r.TokenType = KParseType::SINGLE_QUOTE_NAME;
784                 }
785                 else if ( nMask & TOKEN_CHAR_STRING )
786                 {
787                     eState = ssGetString;
788                     pSym++;
789                     nParseTokensType = 0;   // will be taken of first real character
790                     r.TokenType = KParseType::DOUBLE_QUOTE_STRING;
791                 }
792                 else if ( nMask & TOKEN_CHAR_DONTCARE )
793                 {
794                     if ( nStartTypes & KParseTokens::IGNORE_LEADING_WS )
795                     {
796                         if (eState == ssRewindFromValue)
797                             eState = ssIgnoreLeadingInRewind;
798                         r.LeadingWhiteSpace++;
799                         pSym++;
800                         nParseTokensType = 0;   // wait until real character
801                         bMightBeWord = true;
802                     }
803                     else
804                         eState = ssBounce;
805                 }
806                 else if ( nMask & TOKEN_CHAR_BOOL )
807                 {
808                     eState = ssGetBool;
809                     r.TokenType = KParseType::BOOLEAN;
810                 }
811                 else if ( nMask & TOKEN_CHAR )
812                 {   //! must be last
813                     eState = ssStop;
814                     r.TokenType = KParseType::ONE_SINGLE_CHAR;
815                 }
816                 else
817                     eState = ssBounce;      // not known
818             }
819             break;
820             case ssGetValue :
821             {
822                 if ( nMask & TOKEN_VALUE_DIGIT )
823                 {
824                     if ( 128 <= c )
825                         r.TokenType = KParseType::UNI_NUMBER;
826                     else if ( r.TokenType != KParseType::UNI_NUMBER )
827                         r.TokenType = KParseType::ASC_NUMBER;
828                 }
829                 if ( nMask & TOKEN_VALUE )
830                 {
831                     if ( c == cDecimalSep && ++nDecSeps > 1 )
832                     {
833                         if ( pSrc - pTextStart == 2 )
834                             eState = ssRewindFromValue;
835                             // consecutive separators
836                         else
837                             eState = ssStopBack;
838                     }
839                     // else keep it going
840                 }
841                 else if ( c == 'E' || c == 'e' )
842                 {
843                     UPT_FLAG_TYPE nNext = getFlags( pTextStart, pSrc - pTextStart );
844                     if ( nNext & TOKEN_VALUE_EXP )
845                         ;   // keep it going
846                     else if ( bMightBeWord && ((nNext & TOKEN_WORD) || !*pSrc) )
847                     {   // might be a numerical name (1.2efg)
848                         eState = ssGetWord;
849                         r.TokenType = KParseType::IDENTNAME;
850                     }
851                     else
852                         eState = ssStopBack;
853                 }
854                 else if ( nMask & TOKEN_VALUE_SIGN )
855                 {
856                     if ( (cLast == 'E') || (cLast == 'e') )
857                     {
858                         UPT_FLAG_TYPE nNext = getFlags( pTextStart, pSrc - pTextStart );
859                         if ( nNext & TOKEN_VALUE_EXP_VALUE )
860                             ;   // keep it going
861                         else if ( bMightBeWord && ((nNext & TOKEN_WORD) || !*pSrc) )
862                         {   // might be a numerical name (1.2e+fg)
863                             eState = ssGetWord;
864                             r.TokenType = KParseType::IDENTNAME;
865                         }
866                         else
867                             eState = ssStopBack;
868                     }
869                     else if ( bMightBeWord )
870                     {   // might be a numerical name (1.2+fg)
871                         eState = ssGetWord;
872                         r.TokenType = KParseType::IDENTNAME;
873                     }
874                     else
875                         eState = ssStopBack;
876                 }
877                 else if ( bMightBeWord && (nMask & TOKEN_WORD) )
878                 {   // might be a numerical name (1995.A1)
879                     eState = ssGetWord;
880                     r.TokenType = KParseType::IDENTNAME;
881                 }
882                 else
883                     eState = ssStopBack;
884             }
885             break;
886             case ssGetWordFirstChar :
887                 eState = ssGetWord;
888                 // fall thru
889             case ssGetWord :
890             {
891                 if ( nMask & TOKEN_WORD )
892                     ;   // keep it going
893                 else if ( nMask & TOKEN_NAME_SEP )
894                 {
895                     if ( bQuote )
896                     {
897                         if ( cLast == '\\' )
898                         {   // escaped
899                             aSymbol += OUString( pSym, pSrc - pSym - 2 );
900                             aSymbol += OUString( &c, 1);
901                         }
902                         else
903                         {
904                             eState = ssStop;
905                             aSymbol += OUString( pSym, pSrc - pSym - 1 );
906                         }
907                         pSym = pSrc;
908                     }
909                     else
910                         eState = ssStopBack;
911                 }
912                 else if ( bQuote )
913                     ;   // keep it going
914                 else
915                     eState = ssStopBack;
916             }
917             break;
918             case ssGetString :
919             {
920                 if ( nMask & TOKEN_STRING_SEP )
921                 {
922                     if ( cLast == '\\' )
923                     {   // escaped
924                         aSymbol += OUString( pSym, pSrc - pSym - 2 );
925                         aSymbol += OUString( &c, 1);
926                     }
927                     else if ( c == *pSrc &&
928                             !(nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) )
929                     {   // "" => literal " escaped
930                         aSymbol += OUString( pSym, pSrc - pSym );
931                         pSrc++;
932                     }
933                     else
934                     {
935                         eState = ssStop;
936                         aSymbol += OUString( pSym, pSrc - pSym - 1 );
937                     }
938                     pSym = pSrc;
939                 }
940             }
941             break;
942             case ssGetBool :
943             {
944                 if ( (nMask & TOKEN_BOOL) )
945                     eState = ssStop;    // maximum 2: <, >, <>, <=, >=
946                 else
947                     eState = ssStopBack;
948             }
949             break;
950             case ssStopBack :
951             case ssBounce :
952             case ssStop :
953                 ;   // nothing, no compiler warning
954             break;
955         }
956         if ( eState == ssRewindFromValue )
957         {
958             r = ParseResult();
959             pSym = pTextStart;
960             pSrc = pSym;
961             aSymbol = OUString();
962             c = *pSrc;
963             cLast = 0;
964             nDecSeps = 0;
965             bQuote = false;
966             bMightBeWord = true;
967             bMightBeWordLast = true;
968         }
969         else
970         {
971             if ( !(r.TokenType & nTokenType) )
972             {
973                 if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER))
974                         && (nTokenType & KParseType::IDENTNAME) && bMightBeWord )
975                     ;   // keep a number that might be a word
976                 else if ( r.LeadingWhiteSpace == (pSrc - pTextStart) )
977                     ;   // keep ignored white space
978                 else if ( !r.TokenType && eState == ssGetValue && (nMask & TOKEN_VALUE_SEP) )
979                     ;   // keep uncertain value
980                 else
981                     eState = ssBounce;
982             }
983             if ( eState == ssBounce )
984             {
985                 r.TokenType = 0;
986                 eState = ssStopBack;
987             }
988             if ( eState == ssStopBack )
989             {   // put back
990                 pSrc--;
991                 bMightBeWord = bMightBeWordLast;
992                 eState = ssStop;
993             }
994             if ( eState != ssStop )
995             {
996                 if ( !r.StartFlags )
997                     r.StartFlags |= nParseTokensType;
998                 else
999                     r.ContFlags |= nParseTokensType;
1000             }
1001             bMightBeWordLast = bMightBeWord;
1002             cLast = c;
1003             c = *pSrc;
1004         }
1005     }
1006     // r.CharLen is the length in characters (not code points) of the parsed
1007     // token not including any leading white space, change this calculation if
1008     // multi-code-point Unicode characters are to be supported.
1009     r.CharLen = pSrc - pTextStart - r.LeadingWhiteSpace;
1010     r.EndPos = nPos + (pSrc - pTextStart);
1011     if ( r.TokenType & KParseType::ASC_NUMBER )
1012     {
1013         r.Value = rtl_math_uStringToDouble( pTextStart + r.LeadingWhiteSpace,
1014                 pTextStart + r.EndPos, cDecimalSep, cGroupSep, NULL, NULL );
1015         if ( bMightBeWord )
1016             r.TokenType |= KParseType::IDENTNAME;
1017     }
1018     else if ( r.TokenType & KParseType::UNI_NUMBER )
1019     {
1020         if ( !xNatNumSup.is() )
1021         {
1022 #define NATIVENUMBERSUPPLIER_SERVICENAME "com.sun.star.i18n.NativeNumberSupplier"
1023             if ( xMSF.is() )
1024             {
1025                 xNatNumSup = Reference< XNativeNumberSupplier > (
1026                         xMSF->createInstance( OUString(
1027                                 RTL_CONSTASCII_USTRINGPARAM(
1028                                     NATIVENUMBERSUPPLIER_SERVICENAME ) ) ),
1029                         UNO_QUERY );
1030             }
1031             if ( !xNatNumSup.is() )
1032             {
1033                 throw RuntimeException( OUString(
1034 #ifdef DBG_UTIL
1035                     RTL_CONSTASCII_USTRINGPARAM(
1036                         "cclass_Unicode::parseText: can't instanciate "
1037                         NATIVENUMBERSUPPLIER_SERVICENAME )
1038 #endif
1039                     ), *this );
1040             }
1041 #undef NATIVENUMBERSUPPLIER_SERVICENAME
1042         }
1043         OUString aTmp( pTextStart + r.LeadingWhiteSpace, r.EndPos - nPos +
1044                 r.LeadingWhiteSpace );
1045         // transliterate to ASCII
1046         aTmp = xNatNumSup->getNativeNumberString( aTmp, aParserLocale,
1047                 NativeNumberMode::NATNUM0 );
1048         r.Value = ::rtl::math::stringToDouble( aTmp, cDecimalSep, cGroupSep, NULL, NULL );
1049         if ( bMightBeWord )
1050             r.TokenType |= KParseType::IDENTNAME;
1051     }
1052     else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) )
1053     {
1054         if ( pSym < pSrc )
1055         {   //! open quote
1056             aSymbol += OUString( pSym, pSrc - pSym );
1057             r.TokenType |= KParseType::MISSING_QUOTE;
1058         }
1059         r.DequotedNameOrString = aSymbol;
1060     }
1061 }
1062 
1063 } } } }
1064