xref: /AOO41X/main/i18nutil/source/utility/unicode.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir 
28*cdf0e10cSrcweir #include <com/sun/star/i18n/UnicodeType.hpp>
29*cdf0e10cSrcweir #include <com/sun/star/i18n/KCharacterType.hpp>
30*cdf0e10cSrcweir #include <i18nutil/unicode.hxx>
31*cdf0e10cSrcweir #include "unicode_data.h"
32*cdf0e10cSrcweir 
33*cdf0e10cSrcweir using namespace ::com::sun::star::i18n;
34*cdf0e10cSrcweir 
35*cdf0e10cSrcweir static ScriptTypeList defaultTypeList[] = {
36*cdf0e10cSrcweir     { UnicodeScript_kBasicLatin,
37*cdf0e10cSrcweir       UnicodeScript_kBasicLatin,
38*cdf0e10cSrcweir       UnicodeScript_kBasicLatin },      // 0,
39*cdf0e10cSrcweir     { UnicodeScript_kLatin1Supplement,
40*cdf0e10cSrcweir       UnicodeScript_kLatin1Supplement,
41*cdf0e10cSrcweir       UnicodeScript_kLatin1Supplement },// 1,
42*cdf0e10cSrcweir     { UnicodeScript_kLatinExtendedA,
43*cdf0e10cSrcweir       UnicodeScript_kLatinExtendedA,
44*cdf0e10cSrcweir       UnicodeScript_kLatinExtendedA }, // 2,
45*cdf0e10cSrcweir     { UnicodeScript_kLatinExtendedB,
46*cdf0e10cSrcweir       UnicodeScript_kLatinExtendedB,
47*cdf0e10cSrcweir       UnicodeScript_kLatinExtendedB }, // 3,
48*cdf0e10cSrcweir     { UnicodeScript_kIPAExtension,
49*cdf0e10cSrcweir       UnicodeScript_kIPAExtension,
50*cdf0e10cSrcweir       UnicodeScript_kIPAExtension }, // 4,
51*cdf0e10cSrcweir     { UnicodeScript_kSpacingModifier,
52*cdf0e10cSrcweir       UnicodeScript_kSpacingModifier,
53*cdf0e10cSrcweir       UnicodeScript_kSpacingModifier }, // 5,
54*cdf0e10cSrcweir     { UnicodeScript_kCombiningDiacritical,
55*cdf0e10cSrcweir       UnicodeScript_kCombiningDiacritical,
56*cdf0e10cSrcweir       UnicodeScript_kCombiningDiacritical }, // 6,
57*cdf0e10cSrcweir     { UnicodeScript_kGreek,
58*cdf0e10cSrcweir       UnicodeScript_kGreek,
59*cdf0e10cSrcweir       UnicodeScript_kGreek }, // 7,
60*cdf0e10cSrcweir     { UnicodeScript_kCyrillic,
61*cdf0e10cSrcweir       UnicodeScript_kCyrillic,
62*cdf0e10cSrcweir       UnicodeScript_kCyrillic }, // 8,
63*cdf0e10cSrcweir     { UnicodeScript_kArmenian,
64*cdf0e10cSrcweir       UnicodeScript_kArmenian,
65*cdf0e10cSrcweir       UnicodeScript_kArmenian }, // 9,
66*cdf0e10cSrcweir     { UnicodeScript_kHebrew,
67*cdf0e10cSrcweir       UnicodeScript_kHebrew,
68*cdf0e10cSrcweir       UnicodeScript_kHebrew }, // 10,
69*cdf0e10cSrcweir     { UnicodeScript_kArabic,
70*cdf0e10cSrcweir       UnicodeScript_kArabic,
71*cdf0e10cSrcweir       UnicodeScript_kArabic }, // 11,
72*cdf0e10cSrcweir     { UnicodeScript_kSyriac,
73*cdf0e10cSrcweir       UnicodeScript_kSyriac,
74*cdf0e10cSrcweir       UnicodeScript_kSyriac }, // 12,
75*cdf0e10cSrcweir     { UnicodeScript_kThaana,
76*cdf0e10cSrcweir       UnicodeScript_kThaana,
77*cdf0e10cSrcweir       UnicodeScript_kThaana }, // 13,
78*cdf0e10cSrcweir     { UnicodeScript_kDevanagari,
79*cdf0e10cSrcweir       UnicodeScript_kDevanagari,
80*cdf0e10cSrcweir       UnicodeScript_kDevanagari }, // 14,
81*cdf0e10cSrcweir     { UnicodeScript_kBengali,
82*cdf0e10cSrcweir       UnicodeScript_kBengali,
83*cdf0e10cSrcweir       UnicodeScript_kBengali }, // 15,
84*cdf0e10cSrcweir     { UnicodeScript_kGurmukhi,
85*cdf0e10cSrcweir       UnicodeScript_kGurmukhi,
86*cdf0e10cSrcweir       UnicodeScript_kGurmukhi }, // 16,
87*cdf0e10cSrcweir     { UnicodeScript_kGujarati,
88*cdf0e10cSrcweir       UnicodeScript_kGujarati,
89*cdf0e10cSrcweir       UnicodeScript_kGujarati }, // 17,
90*cdf0e10cSrcweir     { UnicodeScript_kOriya,
91*cdf0e10cSrcweir       UnicodeScript_kOriya,
92*cdf0e10cSrcweir       UnicodeScript_kOriya }, // 18,
93*cdf0e10cSrcweir     { UnicodeScript_kTamil,
94*cdf0e10cSrcweir       UnicodeScript_kTamil,
95*cdf0e10cSrcweir       UnicodeScript_kTamil }, // 19,
96*cdf0e10cSrcweir     { UnicodeScript_kTelugu,
97*cdf0e10cSrcweir       UnicodeScript_kTelugu,
98*cdf0e10cSrcweir       UnicodeScript_kTelugu }, // 20,
99*cdf0e10cSrcweir     { UnicodeScript_kKannada,
100*cdf0e10cSrcweir       UnicodeScript_kKannada,
101*cdf0e10cSrcweir       UnicodeScript_kKannada }, // 21,
102*cdf0e10cSrcweir     { UnicodeScript_kMalayalam,
103*cdf0e10cSrcweir       UnicodeScript_kMalayalam,
104*cdf0e10cSrcweir       UnicodeScript_kMalayalam }, // 22,
105*cdf0e10cSrcweir     { UnicodeScript_kSinhala,
106*cdf0e10cSrcweir       UnicodeScript_kSinhala,
107*cdf0e10cSrcweir       UnicodeScript_kSinhala }, // 23,
108*cdf0e10cSrcweir     { UnicodeScript_kThai,
109*cdf0e10cSrcweir       UnicodeScript_kThai,
110*cdf0e10cSrcweir       UnicodeScript_kThai }, // 24,
111*cdf0e10cSrcweir     { UnicodeScript_kLao,
112*cdf0e10cSrcweir       UnicodeScript_kLao,
113*cdf0e10cSrcweir       UnicodeScript_kLao }, // 25,
114*cdf0e10cSrcweir     { UnicodeScript_kTibetan,
115*cdf0e10cSrcweir       UnicodeScript_kTibetan,
116*cdf0e10cSrcweir       UnicodeScript_kTibetan }, // 26,
117*cdf0e10cSrcweir     { UnicodeScript_kMyanmar,
118*cdf0e10cSrcweir       UnicodeScript_kMyanmar,
119*cdf0e10cSrcweir       UnicodeScript_kMyanmar }, // 27,
120*cdf0e10cSrcweir     { UnicodeScript_kGeorgian,
121*cdf0e10cSrcweir       UnicodeScript_kGeorgian,
122*cdf0e10cSrcweir       UnicodeScript_kGeorgian }, // 28,
123*cdf0e10cSrcweir     { UnicodeScript_kHangulJamo,
124*cdf0e10cSrcweir       UnicodeScript_kHangulJamo,
125*cdf0e10cSrcweir       UnicodeScript_kHangulJamo }, // 29,
126*cdf0e10cSrcweir     { UnicodeScript_kEthiopic,
127*cdf0e10cSrcweir       UnicodeScript_kEthiopic,
128*cdf0e10cSrcweir       UnicodeScript_kEthiopic }, // 30,
129*cdf0e10cSrcweir     { UnicodeScript_kCherokee,
130*cdf0e10cSrcweir       UnicodeScript_kCherokee,
131*cdf0e10cSrcweir       UnicodeScript_kCherokee }, // 31,
132*cdf0e10cSrcweir     { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
133*cdf0e10cSrcweir       UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
134*cdf0e10cSrcweir       UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
135*cdf0e10cSrcweir     { UnicodeScript_kOgham,
136*cdf0e10cSrcweir       UnicodeScript_kOgham,
137*cdf0e10cSrcweir       UnicodeScript_kOgham }, // 33,
138*cdf0e10cSrcweir     { UnicodeScript_kRunic,
139*cdf0e10cSrcweir       UnicodeScript_kRunic,
140*cdf0e10cSrcweir       UnicodeScript_kRunic }, // 34,
141*cdf0e10cSrcweir     { UnicodeScript_kKhmer,
142*cdf0e10cSrcweir       UnicodeScript_kKhmer,
143*cdf0e10cSrcweir       UnicodeScript_kKhmer }, // 35,
144*cdf0e10cSrcweir     { UnicodeScript_kMongolian,
145*cdf0e10cSrcweir       UnicodeScript_kMongolian,
146*cdf0e10cSrcweir       UnicodeScript_kMongolian }, // 36,
147*cdf0e10cSrcweir     { UnicodeScript_kLatinExtendedAdditional,
148*cdf0e10cSrcweir       UnicodeScript_kLatinExtendedAdditional,
149*cdf0e10cSrcweir       UnicodeScript_kLatinExtendedAdditional }, // 37,
150*cdf0e10cSrcweir     { UnicodeScript_kGreekExtended,
151*cdf0e10cSrcweir       UnicodeScript_kGreekExtended,
152*cdf0e10cSrcweir       UnicodeScript_kGreekExtended }, // 38,
153*cdf0e10cSrcweir     { UnicodeScript_kGeneralPunctuation,
154*cdf0e10cSrcweir       UnicodeScript_kGeneralPunctuation,
155*cdf0e10cSrcweir       UnicodeScript_kGeneralPunctuation }, // 39,
156*cdf0e10cSrcweir     { UnicodeScript_kSuperSubScript,
157*cdf0e10cSrcweir       UnicodeScript_kSuperSubScript,
158*cdf0e10cSrcweir       UnicodeScript_kSuperSubScript }, // 40,
159*cdf0e10cSrcweir     { UnicodeScript_kCurrencySymbolScript,
160*cdf0e10cSrcweir       UnicodeScript_kCurrencySymbolScript,
161*cdf0e10cSrcweir       UnicodeScript_kCurrencySymbolScript }, // 41,
162*cdf0e10cSrcweir     { UnicodeScript_kSymbolCombiningMark,
163*cdf0e10cSrcweir       UnicodeScript_kSymbolCombiningMark,
164*cdf0e10cSrcweir       UnicodeScript_kSymbolCombiningMark }, // 42,
165*cdf0e10cSrcweir     { UnicodeScript_kLetterlikeSymbol,
166*cdf0e10cSrcweir       UnicodeScript_kLetterlikeSymbol,
167*cdf0e10cSrcweir       UnicodeScript_kLetterlikeSymbol }, // 43,
168*cdf0e10cSrcweir     { UnicodeScript_kNumberForm,
169*cdf0e10cSrcweir       UnicodeScript_kNumberForm,
170*cdf0e10cSrcweir       UnicodeScript_kNumberForm }, // 44,
171*cdf0e10cSrcweir     { UnicodeScript_kArrow,
172*cdf0e10cSrcweir       UnicodeScript_kArrow,
173*cdf0e10cSrcweir       UnicodeScript_kArrow }, // 45,
174*cdf0e10cSrcweir     { UnicodeScript_kMathOperator,
175*cdf0e10cSrcweir       UnicodeScript_kMathOperator,
176*cdf0e10cSrcweir       UnicodeScript_kMathOperator }, // 46,
177*cdf0e10cSrcweir     { UnicodeScript_kMiscTechnical,
178*cdf0e10cSrcweir       UnicodeScript_kMiscTechnical,
179*cdf0e10cSrcweir       UnicodeScript_kMiscTechnical }, // 47,
180*cdf0e10cSrcweir     { UnicodeScript_kControlPicture,
181*cdf0e10cSrcweir       UnicodeScript_kControlPicture,
182*cdf0e10cSrcweir       UnicodeScript_kControlPicture }, // 48,
183*cdf0e10cSrcweir     { UnicodeScript_kOpticalCharacter,
184*cdf0e10cSrcweir       UnicodeScript_kOpticalCharacter,
185*cdf0e10cSrcweir       UnicodeScript_kOpticalCharacter }, // 49,
186*cdf0e10cSrcweir     { UnicodeScript_kEnclosedAlphanumeric,
187*cdf0e10cSrcweir       UnicodeScript_kEnclosedAlphanumeric,
188*cdf0e10cSrcweir       UnicodeScript_kEnclosedAlphanumeric }, // 50,
189*cdf0e10cSrcweir     { UnicodeScript_kBoxDrawing,
190*cdf0e10cSrcweir       UnicodeScript_kBoxDrawing,
191*cdf0e10cSrcweir       UnicodeScript_kBoxDrawing }, // 51,
192*cdf0e10cSrcweir     { UnicodeScript_kBlockElement,
193*cdf0e10cSrcweir       UnicodeScript_kBlockElement,
194*cdf0e10cSrcweir       UnicodeScript_kBlockElement }, // 52,
195*cdf0e10cSrcweir     { UnicodeScript_kGeometricShape,
196*cdf0e10cSrcweir       UnicodeScript_kGeometricShape,
197*cdf0e10cSrcweir       UnicodeScript_kGeometricShape }, // 53,
198*cdf0e10cSrcweir     { UnicodeScript_kMiscSymbol,
199*cdf0e10cSrcweir       UnicodeScript_kMiscSymbol,
200*cdf0e10cSrcweir       UnicodeScript_kMiscSymbol }, // 54,
201*cdf0e10cSrcweir     { UnicodeScript_kDingbat,
202*cdf0e10cSrcweir       UnicodeScript_kDingbat,
203*cdf0e10cSrcweir       UnicodeScript_kDingbat }, // 55,
204*cdf0e10cSrcweir     { UnicodeScript_kBraillePatterns,
205*cdf0e10cSrcweir       UnicodeScript_kBraillePatterns,
206*cdf0e10cSrcweir       UnicodeScript_kBraillePatterns }, // 56,
207*cdf0e10cSrcweir     { UnicodeScript_kCJKRadicalsSupplement,
208*cdf0e10cSrcweir       UnicodeScript_kCJKRadicalsSupplement,
209*cdf0e10cSrcweir       UnicodeScript_kCJKRadicalsSupplement }, // 57,
210*cdf0e10cSrcweir     { UnicodeScript_kKangxiRadicals,
211*cdf0e10cSrcweir       UnicodeScript_kKangxiRadicals,
212*cdf0e10cSrcweir       UnicodeScript_kKangxiRadicals }, // 58,
213*cdf0e10cSrcweir     { UnicodeScript_kIdeographicDescriptionCharacters,
214*cdf0e10cSrcweir       UnicodeScript_kIdeographicDescriptionCharacters,
215*cdf0e10cSrcweir       UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
216*cdf0e10cSrcweir     { UnicodeScript_kCJKSymbolPunctuation,
217*cdf0e10cSrcweir       UnicodeScript_kCJKSymbolPunctuation,
218*cdf0e10cSrcweir       UnicodeScript_kCJKSymbolPunctuation }, // 60,
219*cdf0e10cSrcweir     { UnicodeScript_kHiragana,
220*cdf0e10cSrcweir       UnicodeScript_kHiragana,
221*cdf0e10cSrcweir       UnicodeScript_kHiragana }, // 61,
222*cdf0e10cSrcweir     { UnicodeScript_kKatakana,
223*cdf0e10cSrcweir       UnicodeScript_kKatakana,
224*cdf0e10cSrcweir       UnicodeScript_kKatakana }, // 62,
225*cdf0e10cSrcweir     { UnicodeScript_kBopomofo,
226*cdf0e10cSrcweir       UnicodeScript_kBopomofo,
227*cdf0e10cSrcweir       UnicodeScript_kBopomofo }, // 63,
228*cdf0e10cSrcweir     { UnicodeScript_kHangulCompatibilityJamo,
229*cdf0e10cSrcweir       UnicodeScript_kHangulCompatibilityJamo,
230*cdf0e10cSrcweir       UnicodeScript_kHangulCompatibilityJamo }, // 64,
231*cdf0e10cSrcweir     { UnicodeScript_kKanbun,
232*cdf0e10cSrcweir       UnicodeScript_kKanbun,
233*cdf0e10cSrcweir       UnicodeScript_kKanbun }, // 65,
234*cdf0e10cSrcweir     { UnicodeScript_kBopomofoExtended,
235*cdf0e10cSrcweir       UnicodeScript_kBopomofoExtended,
236*cdf0e10cSrcweir       UnicodeScript_kBopomofoExtended }, // 66,
237*cdf0e10cSrcweir     { UnicodeScript_kEnclosedCJKLetterMonth,
238*cdf0e10cSrcweir       UnicodeScript_kEnclosedCJKLetterMonth,
239*cdf0e10cSrcweir       UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
240*cdf0e10cSrcweir     { UnicodeScript_kCJKCompatibility,
241*cdf0e10cSrcweir       UnicodeScript_kCJKCompatibility,
242*cdf0e10cSrcweir       UnicodeScript_kCJKCompatibility }, // 68,
243*cdf0e10cSrcweir     { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
244*cdf0e10cSrcweir       UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
245*cdf0e10cSrcweir       UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
246*cdf0e10cSrcweir     { UnicodeScript_kCJKUnifiedIdeograph,
247*cdf0e10cSrcweir       UnicodeScript_kCJKUnifiedIdeograph,
248*cdf0e10cSrcweir       UnicodeScript_kCJKUnifiedIdeograph }, // 70,
249*cdf0e10cSrcweir     { UnicodeScript_kYiSyllables,
250*cdf0e10cSrcweir       UnicodeScript_kYiSyllables,
251*cdf0e10cSrcweir       UnicodeScript_kYiSyllables }, // 71,
252*cdf0e10cSrcweir     { UnicodeScript_kYiRadicals,
253*cdf0e10cSrcweir       UnicodeScript_kYiRadicals,
254*cdf0e10cSrcweir       UnicodeScript_kYiRadicals }, // 72,
255*cdf0e10cSrcweir     { UnicodeScript_kHangulSyllable,
256*cdf0e10cSrcweir       UnicodeScript_kHangulSyllable,
257*cdf0e10cSrcweir       UnicodeScript_kHangulSyllable }, // 73,
258*cdf0e10cSrcweir     { UnicodeScript_kHighSurrogate,
259*cdf0e10cSrcweir       UnicodeScript_kHighSurrogate,
260*cdf0e10cSrcweir       UnicodeScript_kHighSurrogate }, // 74,
261*cdf0e10cSrcweir     { UnicodeScript_kHighPrivateUseSurrogate,
262*cdf0e10cSrcweir       UnicodeScript_kHighPrivateUseSurrogate,
263*cdf0e10cSrcweir       UnicodeScript_kHighPrivateUseSurrogate }, // 75,
264*cdf0e10cSrcweir     { UnicodeScript_kLowSurrogate,
265*cdf0e10cSrcweir       UnicodeScript_kLowSurrogate,
266*cdf0e10cSrcweir       UnicodeScript_kLowSurrogate }, // 76,
267*cdf0e10cSrcweir     { UnicodeScript_kPrivateUse,
268*cdf0e10cSrcweir       UnicodeScript_kPrivateUse,
269*cdf0e10cSrcweir       UnicodeScript_kPrivateUse }, // 77,
270*cdf0e10cSrcweir     { UnicodeScript_kCJKCompatibilityIdeograph,
271*cdf0e10cSrcweir       UnicodeScript_kCJKCompatibilityIdeograph,
272*cdf0e10cSrcweir       UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
273*cdf0e10cSrcweir     { UnicodeScript_kAlphabeticPresentation,
274*cdf0e10cSrcweir       UnicodeScript_kAlphabeticPresentation,
275*cdf0e10cSrcweir       UnicodeScript_kAlphabeticPresentation }, // 79,
276*cdf0e10cSrcweir     { UnicodeScript_kArabicPresentationA,
277*cdf0e10cSrcweir       UnicodeScript_kArabicPresentationA,
278*cdf0e10cSrcweir       UnicodeScript_kArabicPresentationA }, // 80,
279*cdf0e10cSrcweir     { UnicodeScript_kCombiningHalfMark,
280*cdf0e10cSrcweir       UnicodeScript_kCombiningHalfMark,
281*cdf0e10cSrcweir       UnicodeScript_kCombiningHalfMark }, // 81,
282*cdf0e10cSrcweir     { UnicodeScript_kCJKCompatibilityForm,
283*cdf0e10cSrcweir       UnicodeScript_kCJKCompatibilityForm,
284*cdf0e10cSrcweir       UnicodeScript_kCJKCompatibilityForm }, // 82,
285*cdf0e10cSrcweir     { UnicodeScript_kSmallFormVariant,
286*cdf0e10cSrcweir       UnicodeScript_kSmallFormVariant,
287*cdf0e10cSrcweir       UnicodeScript_kSmallFormVariant }, // 83,
288*cdf0e10cSrcweir     { UnicodeScript_kArabicPresentationB,
289*cdf0e10cSrcweir       UnicodeScript_kArabicPresentationB,
290*cdf0e10cSrcweir       UnicodeScript_kArabicPresentationB }, // 84,
291*cdf0e10cSrcweir     { UnicodeScript_kNoScript,
292*cdf0e10cSrcweir       UnicodeScript_kNoScript,
293*cdf0e10cSrcweir       UnicodeScript_kNoScript }, // 85,
294*cdf0e10cSrcweir     { UnicodeScript_kHalfwidthFullwidthForm,
295*cdf0e10cSrcweir       UnicodeScript_kHalfwidthFullwidthForm,
296*cdf0e10cSrcweir       UnicodeScript_kHalfwidthFullwidthForm }, // 86,
297*cdf0e10cSrcweir     { UnicodeScript_kScriptCount,
298*cdf0e10cSrcweir       UnicodeScript_kScriptCount,
299*cdf0e10cSrcweir       UnicodeScript_kNoScript } // 87,
300*cdf0e10cSrcweir };
301*cdf0e10cSrcweir 
302*cdf0e10cSrcweir sal_Int16 SAL_CALL
303*cdf0e10cSrcweir unicode::getUnicodeScriptType( const sal_Unicode ch, ScriptTypeList* typeList, sal_Int16 unknownType ) {
304*cdf0e10cSrcweir 
305*cdf0e10cSrcweir     if (!typeList) {
306*cdf0e10cSrcweir         typeList = defaultTypeList;
307*cdf0e10cSrcweir         unknownType = UnicodeScript_kNoScript;
308*cdf0e10cSrcweir     }
309*cdf0e10cSrcweir 
310*cdf0e10cSrcweir     sal_Int16 i = 0, type = typeList[0].to;
311*cdf0e10cSrcweir     while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
312*cdf0e10cSrcweir         type = typeList[++i].to;
313*cdf0e10cSrcweir     }
314*cdf0e10cSrcweir 
315*cdf0e10cSrcweir     return (type < UnicodeScript_kScriptCount &&
316*cdf0e10cSrcweir             ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
317*cdf0e10cSrcweir             typeList[i].value : unknownType;
318*cdf0e10cSrcweir }
319*cdf0e10cSrcweir 
320*cdf0e10cSrcweir sal_Bool SAL_CALL
321*cdf0e10cSrcweir unicode::isUnicodeScriptType( const sal_Unicode ch, sal_Int16 type) {
322*cdf0e10cSrcweir     return ch >= UnicodeScriptType[type][UnicodeScriptTypeFrom] &&
323*cdf0e10cSrcweir         ch <= UnicodeScriptType[type][UnicodeScriptTypeTo];
324*cdf0e10cSrcweir }
325*cdf0e10cSrcweir 
326*cdf0e10cSrcweir sal_Unicode SAL_CALL
327*cdf0e10cSrcweir unicode::getUnicodeScriptStart( UnicodeScript type) {
328*cdf0e10cSrcweir     return UnicodeScriptType[type][UnicodeScriptTypeFrom];
329*cdf0e10cSrcweir }
330*cdf0e10cSrcweir 
331*cdf0e10cSrcweir sal_Unicode SAL_CALL
332*cdf0e10cSrcweir unicode::getUnicodeScriptEnd( UnicodeScript type) {
333*cdf0e10cSrcweir     return UnicodeScriptType[type][UnicodeScriptTypeTo];
334*cdf0e10cSrcweir }
335*cdf0e10cSrcweir 
336*cdf0e10cSrcweir sal_Int16 SAL_CALL
337*cdf0e10cSrcweir unicode::getUnicodeType( const sal_Unicode ch ) {
338*cdf0e10cSrcweir     static sal_Unicode c = 0x00;
339*cdf0e10cSrcweir     static sal_Int16 r = 0x00;
340*cdf0e10cSrcweir 
341*cdf0e10cSrcweir     if (ch == c) return r;
342*cdf0e10cSrcweir     else c = ch;
343*cdf0e10cSrcweir 
344*cdf0e10cSrcweir     sal_Int16 address = UnicodeTypeIndex[ch >> 8];
345*cdf0e10cSrcweir     return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
346*cdf0e10cSrcweir         UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
347*cdf0e10cSrcweir }
348*cdf0e10cSrcweir 
349*cdf0e10cSrcweir sal_uInt8 SAL_CALL
350*cdf0e10cSrcweir unicode::getUnicodeDirection( const sal_Unicode ch ) {
351*cdf0e10cSrcweir     static sal_Unicode c = 0x00;
352*cdf0e10cSrcweir     static sal_uInt8 r = 0x00;
353*cdf0e10cSrcweir 
354*cdf0e10cSrcweir     if (ch == c) return r;
355*cdf0e10cSrcweir     else c = ch;
356*cdf0e10cSrcweir 
357*cdf0e10cSrcweir     sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
358*cdf0e10cSrcweir     return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
359*cdf0e10cSrcweir         UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
360*cdf0e10cSrcweir 
361*cdf0e10cSrcweir }
362*cdf0e10cSrcweir 
// Bit-set helpers for classifying UnicodeType values.
//
// bit(n) yields a word with only bit n set.  The shift is done on an unsigned
// 1 so that bit(31) is well defined, and the argument is parenthesised so an
// expression can be passed safely.
//
// Every *MASK macro is fully parenthesised: each expands to a single primary
// expression and can be combined with any operator without precedence
// surprises.
#define bit(name)   (1U << (name))

#define UPPERMASK   (bit(UnicodeType::UPPERCASE_LETTER))

#define LOWERMASK   (bit(UnicodeType::LOWERCASE_LETTER))

#define TITLEMASK   (bit(UnicodeType::TITLECASE_LETTER))

// Any kind of number: decimal digits, letter numbers and other numbers.
#define DIGITMASK   (bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\
            bit(UnicodeType::LETTER_NUMBER)|\
            bit(UnicodeType::OTHER_NUMBER))

// Any kind of letter, cased or not.
#define ALPHAMASK   (UPPERMASK|LOWERMASK|TITLEMASK|\
            bit(UnicodeType::MODIFIER_LETTER)|\
            bit(UnicodeType::OTHER_LETTER))

// Letters, numbers and all three mark categories.
#define BASEMASK    (DIGITMASK|ALPHAMASK|\
            bit(UnicodeType::NON_SPACING_MARK)|\
            bit(UnicodeType::ENCLOSING_MARK)|\
            bit(UnicodeType::COMBINING_SPACING_MARK))

#define SPACEMASK   (bit(UnicodeType::SPACE_SEPARATOR)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))

#define PUNCTUATIONMASK (bit(UnicodeType::DASH_PUNCTUATION)|\
            bit(UnicodeType::INITIAL_PUNCTUATION)|\
            bit(UnicodeType::FINAL_PUNCTUATION)|\
            bit(UnicodeType::CONNECTOR_PUNCTUATION)|\
            bit(UnicodeType::OTHER_PUNCTUATION))

#define SYMBOLMASK  (bit(UnicodeType::MATH_SYMBOL)|\
            bit(UnicodeType::CURRENCY_SYMBOL)|\
            bit(UnicodeType::MODIFIER_SYMBOL)|\
            bit(UnicodeType::OTHER_SYMBOL))

#define PRINTMASK   (BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK)

// Line and paragraph separators are members of both SPACEMASK and CONTROLMASK.
#define CONTROLMASK (bit(UnicodeType::CONTROL)|\
            bit(UnicodeType::FORMAT)|\
            bit(UnicodeType::LINE_SEPARATOR)|\
            bit(UnicodeType::PARAGRAPH_SEPARATOR))
405*cdf0e10cSrcweir 
406*cdf0e10cSrcweir #define IsType(func, mask)  \
407*cdf0e10cSrcweir sal_Bool SAL_CALL func( const sal_Unicode ch) {\
408*cdf0e10cSrcweir     return (bit(getUnicodeType(ch)) & (mask)) != 0;\
409*cdf0e10cSrcweir }
410*cdf0e10cSrcweir 
411*cdf0e10cSrcweir IsType(unicode::isUpper, UPPERMASK)
412*cdf0e10cSrcweir IsType(unicode::isLower, LOWERMASK)
413*cdf0e10cSrcweir IsType(unicode::isTitle, DIGITMASK)
414*cdf0e10cSrcweir IsType(unicode::isControl, CONTROLMASK)
415*cdf0e10cSrcweir IsType(unicode::isPrint, PRINTMASK)
416*cdf0e10cSrcweir IsType(unicode::isAlpha, ALPHAMASK)
417*cdf0e10cSrcweir IsType(unicode::isDigit, DIGITMASK)
418*cdf0e10cSrcweir IsType(unicode::isAlphaDigit, ALPHAMASK|DIGITMASK)
419*cdf0e10cSrcweir IsType(unicode::isSpace, SPACEMASK)
420*cdf0e10cSrcweir IsType(unicode::isBase, BASEMASK)
421*cdf0e10cSrcweir IsType(unicode::isPunctuation, PUNCTUATIONMASK)
422*cdf0e10cSrcweir 
423*cdf0e10cSrcweir #define CONTROLSPACE    bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
424*cdf0e10cSrcweir             bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
425*cdf0e10cSrcweir 
426*cdf0e10cSrcweir sal_Bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
427*cdf0e10cSrcweir     return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
428*cdf0e10cSrcweir }
429*cdf0e10cSrcweir 
430*cdf0e10cSrcweir sal_Int32 SAL_CALL unicode::getCharType( const sal_Unicode ch )
431*cdf0e10cSrcweir {
432*cdf0e10cSrcweir     using namespace ::com::sun::star::i18n::KCharacterType;
433*cdf0e10cSrcweir 
434*cdf0e10cSrcweir     switch ( getUnicodeType( ch ) ) {
435*cdf0e10cSrcweir     // Upper
436*cdf0e10cSrcweir     case UnicodeType::UPPERCASE_LETTER :
437*cdf0e10cSrcweir         return UPPER|LETTER|PRINTABLE|BASE_FORM;
438*cdf0e10cSrcweir 
439*cdf0e10cSrcweir     // Lower
440*cdf0e10cSrcweir     case UnicodeType::LOWERCASE_LETTER :
441*cdf0e10cSrcweir         return LOWER|LETTER|PRINTABLE|BASE_FORM;
442*cdf0e10cSrcweir 
443*cdf0e10cSrcweir     // Title
444*cdf0e10cSrcweir     case UnicodeType::TITLECASE_LETTER :
445*cdf0e10cSrcweir         return TITLE_CASE|LETTER|PRINTABLE|BASE_FORM;
446*cdf0e10cSrcweir 
447*cdf0e10cSrcweir     // Letter
448*cdf0e10cSrcweir     case UnicodeType::MODIFIER_LETTER :
449*cdf0e10cSrcweir     case UnicodeType::OTHER_LETTER :
450*cdf0e10cSrcweir         return LETTER|PRINTABLE|BASE_FORM;
451*cdf0e10cSrcweir 
452*cdf0e10cSrcweir     // Digit
453*cdf0e10cSrcweir     case UnicodeType::DECIMAL_DIGIT_NUMBER:
454*cdf0e10cSrcweir     case UnicodeType::LETTER_NUMBER:
455*cdf0e10cSrcweir     case UnicodeType::OTHER_NUMBER:
456*cdf0e10cSrcweir         return DIGIT|PRINTABLE|BASE_FORM;
457*cdf0e10cSrcweir 
458*cdf0e10cSrcweir     // Base
459*cdf0e10cSrcweir     case UnicodeType::NON_SPACING_MARK:
460*cdf0e10cSrcweir     case UnicodeType::ENCLOSING_MARK:
461*cdf0e10cSrcweir     case UnicodeType::COMBINING_SPACING_MARK:
462*cdf0e10cSrcweir         return BASE_FORM|PRINTABLE;
463*cdf0e10cSrcweir 
464*cdf0e10cSrcweir     // Print
465*cdf0e10cSrcweir     case UnicodeType::SPACE_SEPARATOR:
466*cdf0e10cSrcweir 
467*cdf0e10cSrcweir     case UnicodeType::DASH_PUNCTUATION:
468*cdf0e10cSrcweir     case UnicodeType::INITIAL_PUNCTUATION:
469*cdf0e10cSrcweir     case UnicodeType::FINAL_PUNCTUATION:
470*cdf0e10cSrcweir     case UnicodeType::CONNECTOR_PUNCTUATION:
471*cdf0e10cSrcweir     case UnicodeType::OTHER_PUNCTUATION:
472*cdf0e10cSrcweir 
473*cdf0e10cSrcweir     case UnicodeType::MATH_SYMBOL:
474*cdf0e10cSrcweir     case UnicodeType::CURRENCY_SYMBOL:
475*cdf0e10cSrcweir     case UnicodeType::MODIFIER_SYMBOL:
476*cdf0e10cSrcweir     case UnicodeType::OTHER_SYMBOL:
477*cdf0e10cSrcweir         return PRINTABLE;
478*cdf0e10cSrcweir 
479*cdf0e10cSrcweir     // Control
480*cdf0e10cSrcweir     case UnicodeType::CONTROL:
481*cdf0e10cSrcweir     case UnicodeType::FORMAT:
482*cdf0e10cSrcweir         return CONTROL;
483*cdf0e10cSrcweir 
484*cdf0e10cSrcweir     case UnicodeType::LINE_SEPARATOR:
485*cdf0e10cSrcweir     case UnicodeType::PARAGRAPH_SEPARATOR:
486*cdf0e10cSrcweir         return CONTROL|PRINTABLE;
487*cdf0e10cSrcweir 
488*cdf0e10cSrcweir     // for all others
489*cdf0e10cSrcweir     default:
490*cdf0e10cSrcweir         return 0;
491*cdf0e10cSrcweir     }
492*cdf0e10cSrcweir }
493*cdf0e10cSrcweir 
494*cdf0e10cSrcweir 
495