1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 #include <com/sun/star/i18n/UnicodeType.hpp> 25 #include <com/sun/star/i18n/KCharacterType.hpp> 26 #include <i18nutil/unicode.hxx> 27 #include "unicode_data.h" 28 29 using namespace ::com::sun::star::i18n; 30 31 static ScriptTypeList defaultTypeList[] = { 32 { UnicodeScript_kBasicLatin, 33 UnicodeScript_kBasicLatin, 34 UnicodeScript_kBasicLatin }, // 0, 35 { UnicodeScript_kLatin1Supplement, 36 UnicodeScript_kLatin1Supplement, 37 UnicodeScript_kLatin1Supplement },// 1, 38 { UnicodeScript_kLatinExtendedA, 39 UnicodeScript_kLatinExtendedA, 40 UnicodeScript_kLatinExtendedA }, // 2, 41 { UnicodeScript_kLatinExtendedB, 42 UnicodeScript_kLatinExtendedB, 43 UnicodeScript_kLatinExtendedB }, // 3, 44 { UnicodeScript_kIPAExtension, 45 UnicodeScript_kIPAExtension, 46 UnicodeScript_kIPAExtension }, // 4, 47 { UnicodeScript_kSpacingModifier, 48 UnicodeScript_kSpacingModifier, 49 UnicodeScript_kSpacingModifier }, // 5, 50 { UnicodeScript_kCombiningDiacritical, 51 UnicodeScript_kCombiningDiacritical, 52 UnicodeScript_kCombiningDiacritical }, // 6, 53 { UnicodeScript_kGreek, 54 UnicodeScript_kGreek, 55 UnicodeScript_kGreek }, // 7, 56 { UnicodeScript_kCyrillic, 57 UnicodeScript_kCyrillic, 58 UnicodeScript_kCyrillic }, // 8, 59 { UnicodeScript_kArmenian, 60 UnicodeScript_kArmenian, 61 UnicodeScript_kArmenian }, // 9, 62 { UnicodeScript_kHebrew, 63 UnicodeScript_kHebrew, 64 UnicodeScript_kHebrew }, // 10, 65 { UnicodeScript_kArabic, 66 UnicodeScript_kArabic, 67 UnicodeScript_kArabic }, // 11, 68 { UnicodeScript_kSyriac, 69 UnicodeScript_kSyriac, 70 UnicodeScript_kSyriac }, // 12, 71 { UnicodeScript_kThaana, 72 UnicodeScript_kThaana, 73 UnicodeScript_kThaana }, // 13, 74 { UnicodeScript_kDevanagari, 75 UnicodeScript_kDevanagari, 76 UnicodeScript_kDevanagari }, // 14, 77 { UnicodeScript_kBengali, 78 UnicodeScript_kBengali, 79 UnicodeScript_kBengali }, // 15, 80 { UnicodeScript_kGurmukhi, 81 UnicodeScript_kGurmukhi, 82 UnicodeScript_kGurmukhi }, // 16, 83 { UnicodeScript_kGujarati, 84 UnicodeScript_kGujarati, 85 UnicodeScript_kGujarati }, // 17, 86 { UnicodeScript_kOriya, 87 UnicodeScript_kOriya, 88 UnicodeScript_kOriya }, // 18, 89 { UnicodeScript_kTamil, 90 UnicodeScript_kTamil, 91 UnicodeScript_kTamil }, // 19, 92 { UnicodeScript_kTelugu, 93 UnicodeScript_kTelugu, 94 UnicodeScript_kTelugu }, // 20, 95 { UnicodeScript_kKannada, 96 UnicodeScript_kKannada, 97 UnicodeScript_kKannada }, // 21, 98 { UnicodeScript_kMalayalam, 99 UnicodeScript_kMalayalam, 100 UnicodeScript_kMalayalam }, // 22, 101 { UnicodeScript_kSinhala, 102 UnicodeScript_kSinhala, 103 UnicodeScript_kSinhala }, // 23, 104 { UnicodeScript_kThai, 105 UnicodeScript_kThai, 106 UnicodeScript_kThai }, // 24, 107 { UnicodeScript_kLao, 108 UnicodeScript_kLao, 109 UnicodeScript_kLao }, // 25, 110 { UnicodeScript_kTibetan, 111 UnicodeScript_kTibetan, 112 UnicodeScript_kTibetan }, // 26, 113 { UnicodeScript_kMyanmar, 114 UnicodeScript_kMyanmar, 115 UnicodeScript_kMyanmar }, // 27, 116 { UnicodeScript_kGeorgian, 117 UnicodeScript_kGeorgian, 118 UnicodeScript_kGeorgian }, // 28, 119 { UnicodeScript_kHangulJamo, 120 UnicodeScript_kHangulJamo, 121 UnicodeScript_kHangulJamo }, // 29, 122 { UnicodeScript_kEthiopic, 123 UnicodeScript_kEthiopic, 124 UnicodeScript_kEthiopic }, // 30, 125 { UnicodeScript_kCherokee, 126 UnicodeScript_kCherokee, 127 UnicodeScript_kCherokee }, // 31, 128 { UnicodeScript_kUnifiedCanadianAboriginalSyllabics, 129 UnicodeScript_kUnifiedCanadianAboriginalSyllabics, 130 UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32, 131 { UnicodeScript_kOgham, 132 UnicodeScript_kOgham, 133 UnicodeScript_kOgham }, // 33, 134 { UnicodeScript_kRunic, 135 UnicodeScript_kRunic, 136 UnicodeScript_kRunic }, // 34, 137 { UnicodeScript_kKhmer, 138 UnicodeScript_kKhmer, 139 UnicodeScript_kKhmer }, // 35, 140 { UnicodeScript_kMongolian, 141 UnicodeScript_kMongolian, 142 UnicodeScript_kMongolian }, // 36, 143 { UnicodeScript_kLatinExtendedAdditional, 144 UnicodeScript_kLatinExtendedAdditional, 145 UnicodeScript_kLatinExtendedAdditional }, // 37, 146 { UnicodeScript_kGreekExtended, 147 UnicodeScript_kGreekExtended, 148 UnicodeScript_kGreekExtended }, // 38, 149 { UnicodeScript_kGeneralPunctuation, 150 UnicodeScript_kGeneralPunctuation, 151 UnicodeScript_kGeneralPunctuation }, // 39, 152 { UnicodeScript_kSuperSubScript, 153 UnicodeScript_kSuperSubScript, 154 UnicodeScript_kSuperSubScript }, // 40, 155 { UnicodeScript_kCurrencySymbolScript, 156 UnicodeScript_kCurrencySymbolScript, 157 UnicodeScript_kCurrencySymbolScript }, // 41, 158 { UnicodeScript_kSymbolCombiningMark, 159 UnicodeScript_kSymbolCombiningMark, 160 UnicodeScript_kSymbolCombiningMark }, // 42, 161 { UnicodeScript_kLetterlikeSymbol, 162 UnicodeScript_kLetterlikeSymbol, 163 UnicodeScript_kLetterlikeSymbol }, // 43, 164 { UnicodeScript_kNumberForm, 165 UnicodeScript_kNumberForm, 166 UnicodeScript_kNumberForm }, // 44, 167 { UnicodeScript_kArrow, 168 UnicodeScript_kArrow, 169 UnicodeScript_kArrow }, // 45, 170 { UnicodeScript_kMathOperator, 171 UnicodeScript_kMathOperator, 172 UnicodeScript_kMathOperator }, // 46, 173 { UnicodeScript_kMiscTechnical, 174 UnicodeScript_kMiscTechnical, 175 UnicodeScript_kMiscTechnical }, // 47, 176 { UnicodeScript_kControlPicture, 177 UnicodeScript_kControlPicture, 178 UnicodeScript_kControlPicture }, // 48, 179 { UnicodeScript_kOpticalCharacter, 180 UnicodeScript_kOpticalCharacter, 181 UnicodeScript_kOpticalCharacter }, // 49, 182 { UnicodeScript_kEnclosedAlphanumeric, 183 UnicodeScript_kEnclosedAlphanumeric, 184 UnicodeScript_kEnclosedAlphanumeric }, // 50, 185 { UnicodeScript_kBoxDrawing, 186 UnicodeScript_kBoxDrawing, 187 UnicodeScript_kBoxDrawing }, // 51, 188 { UnicodeScript_kBlockElement, 189 UnicodeScript_kBlockElement, 190 UnicodeScript_kBlockElement }, // 52, 191 { UnicodeScript_kGeometricShape, 192 UnicodeScript_kGeometricShape, 193 UnicodeScript_kGeometricShape }, // 53, 194 { UnicodeScript_kMiscSymbol, 195 UnicodeScript_kMiscSymbol, 196 UnicodeScript_kMiscSymbol }, // 54, 197 { UnicodeScript_kDingbat, 198 UnicodeScript_kDingbat, 199 UnicodeScript_kDingbat }, // 55, 200 { UnicodeScript_kBraillePatterns, 201 UnicodeScript_kBraillePatterns, 202 UnicodeScript_kBraillePatterns }, // 56, 203 { UnicodeScript_kCJKRadicalsSupplement, 204 UnicodeScript_kCJKRadicalsSupplement, 205 UnicodeScript_kCJKRadicalsSupplement }, // 57, 206 { UnicodeScript_kKangxiRadicals, 207 UnicodeScript_kKangxiRadicals, 208 UnicodeScript_kKangxiRadicals }, // 58, 209 { UnicodeScript_kIdeographicDescriptionCharacters, 210 UnicodeScript_kIdeographicDescriptionCharacters, 211 UnicodeScript_kIdeographicDescriptionCharacters }, // 59, 212 { UnicodeScript_kCJKSymbolPunctuation, 213 UnicodeScript_kCJKSymbolPunctuation, 214 UnicodeScript_kCJKSymbolPunctuation }, // 60, 215 { UnicodeScript_kHiragana, 216 UnicodeScript_kHiragana, 217 UnicodeScript_kHiragana }, // 61, 218 { UnicodeScript_kKatakana, 219 UnicodeScript_kKatakana, 220 UnicodeScript_kKatakana }, // 62, 221 { UnicodeScript_kBopomofo, 222 UnicodeScript_kBopomofo, 223 UnicodeScript_kBopomofo }, // 63, 224 { UnicodeScript_kHangulCompatibilityJamo, 225 UnicodeScript_kHangulCompatibilityJamo, 226 UnicodeScript_kHangulCompatibilityJamo }, // 64, 227 { UnicodeScript_kKanbun, 228 UnicodeScript_kKanbun, 229 UnicodeScript_kKanbun }, // 65, 230 { UnicodeScript_kBopomofoExtended, 231 UnicodeScript_kBopomofoExtended, 232 UnicodeScript_kBopomofoExtended }, // 66, 233 { UnicodeScript_kEnclosedCJKLetterMonth, 234 UnicodeScript_kEnclosedCJKLetterMonth, 235 UnicodeScript_kEnclosedCJKLetterMonth }, // 67, 236 { UnicodeScript_kCJKCompatibility, 237 UnicodeScript_kCJKCompatibility, 238 UnicodeScript_kCJKCompatibility }, // 68, 239 { UnicodeScript_k_CJKUnifiedIdeographsExtensionA, 240 UnicodeScript_k_CJKUnifiedIdeographsExtensionA, 241 UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69, 242 { UnicodeScript_kCJKUnifiedIdeograph, 243 UnicodeScript_kCJKUnifiedIdeograph, 244 UnicodeScript_kCJKUnifiedIdeograph }, // 70, 245 { UnicodeScript_kYiSyllables, 246 UnicodeScript_kYiSyllables, 247 UnicodeScript_kYiSyllables }, // 71, 248 { UnicodeScript_kYiRadicals, 249 UnicodeScript_kYiRadicals, 250 UnicodeScript_kYiRadicals }, // 72, 251 { UnicodeScript_kHangulSyllable, 252 UnicodeScript_kHangulSyllable, 253 UnicodeScript_kHangulSyllable }, // 73, 254 { UnicodeScript_kHighSurrogate, 255 UnicodeScript_kHighSurrogate, 256 UnicodeScript_kHighSurrogate }, // 74, 257 { UnicodeScript_kHighPrivateUseSurrogate, 258 UnicodeScript_kHighPrivateUseSurrogate, 259 UnicodeScript_kHighPrivateUseSurrogate }, // 75, 260 { UnicodeScript_kLowSurrogate, 261 UnicodeScript_kLowSurrogate, 262 UnicodeScript_kLowSurrogate }, // 76, 263 { UnicodeScript_kPrivateUse, 264 UnicodeScript_kPrivateUse, 265 UnicodeScript_kPrivateUse }, // 77, 266 { UnicodeScript_kCJKCompatibilityIdeograph, 267 UnicodeScript_kCJKCompatibilityIdeograph, 268 UnicodeScript_kCJKCompatibilityIdeograph }, // 78, 269 { UnicodeScript_kAlphabeticPresentation, 270 UnicodeScript_kAlphabeticPresentation, 271 UnicodeScript_kAlphabeticPresentation }, // 79, 272 { UnicodeScript_kArabicPresentationA, 273 UnicodeScript_kArabicPresentationA, 274 UnicodeScript_kArabicPresentationA }, // 80, 275 { UnicodeScript_kCombiningHalfMark, 276 UnicodeScript_kCombiningHalfMark, 277 UnicodeScript_kCombiningHalfMark }, // 81, 278 { UnicodeScript_kCJKCompatibilityForm, 279 UnicodeScript_kCJKCompatibilityForm, 280 UnicodeScript_kCJKCompatibilityForm }, // 82, 281 { UnicodeScript_kSmallFormVariant, 282 UnicodeScript_kSmallFormVariant, 283 UnicodeScript_kSmallFormVariant }, // 83, 284 { UnicodeScript_kArabicPresentationB, 285 UnicodeScript_kArabicPresentationB, 286 UnicodeScript_kArabicPresentationB }, // 84, 287 { UnicodeScript_kNoScript, 288 UnicodeScript_kNoScript, 289 UnicodeScript_kNoScript }, // 85, 290 { UnicodeScript_kHalfwidthFullwidthForm, 291 UnicodeScript_kHalfwidthFullwidthForm, 292 UnicodeScript_kHalfwidthFullwidthForm }, // 86, 293 { UnicodeScript_kScriptCount, 294 UnicodeScript_kScriptCount, 295 UnicodeScript_kNoScript } // 87, 296 }; 297 298 sal_Int16 SAL_CALL 299 unicode::getUnicodeScriptType( const sal_Unicode ch, ScriptTypeList* typeList, sal_Int16 unknownType ) { 300 301 if (!typeList) { 302 typeList = defaultTypeList; 303 unknownType = UnicodeScript_kNoScript; 304 } 305 306 sal_Int16 i = 0, type = typeList[0].to; 307 while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) { 308 type = typeList[++i].to; 309 } 310 311 return (type < UnicodeScript_kScriptCount && 312 ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ? 313 typeList[i].value : unknownType; 314 } 315 316 sal_Bool SAL_CALL 317 unicode::isUnicodeScriptType( const sal_Unicode ch, sal_Int16 type) { 318 return ch >= UnicodeScriptType[type][UnicodeScriptTypeFrom] && 319 ch <= UnicodeScriptType[type][UnicodeScriptTypeTo]; 320 } 321 322 sal_Unicode SAL_CALL 323 unicode::getUnicodeScriptStart( UnicodeScript type) { 324 return UnicodeScriptType[type][UnicodeScriptTypeFrom]; 325 } 326 327 sal_Unicode SAL_CALL 328 unicode::getUnicodeScriptEnd( UnicodeScript type) { 329 return UnicodeScriptType[type][UnicodeScriptTypeTo]; 330 } 331 332 sal_Int16 SAL_CALL 333 unicode::getUnicodeType( const sal_Unicode ch ) { 334 static sal_Unicode c = 0x00; 335 static sal_Int16 r = 0x00; 336 337 if (ch == c) return r; 338 else c = ch; 339 340 sal_Int16 address = UnicodeTypeIndex[ch >> 8]; 341 return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] : 342 UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]); 343 } 344 345 sal_uInt8 SAL_CALL 346 unicode::getUnicodeDirection( const sal_Unicode ch ) { 347 static sal_Unicode c = 0x00; 348 static sal_uInt8 r = 0x00; 349 350 if (ch == c) return r; 351 else c = ch; 352 353 sal_Int16 address = UnicodeDirectionIndex[ch >> 8]; 354 return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] : 355 UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]); 356 357 } 358 359 #define bit(name) (1 << name) 360 361 #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER) 362 363 #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER) 364 365 #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER) 366 367 #define DIGITMASK bit(UnicodeType::DECIMAL_DIGIT_NUMBER)|\ 368 bit(UnicodeType::LETTER_NUMBER)|\ 369 bit(UnicodeType::OTHER_NUMBER) 370 371 #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\ 372 bit(UnicodeType::MODIFIER_LETTER)|\ 373 bit(UnicodeType::OTHER_LETTER) 374 375 #define BASEMASK DIGITMASK|ALPHAMASK|\ 376 bit(UnicodeType::NON_SPACING_MARK)|\ 377 bit(UnicodeType::ENCLOSING_MARK)|\ 378 bit(UnicodeType::COMBINING_SPACING_MARK) 379 380 #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\ 381 bit(UnicodeType::LINE_SEPARATOR)|\ 382 bit(UnicodeType::PARAGRAPH_SEPARATOR) 383 384 #define PUNCTUATIONMASK bit(UnicodeType::DASH_PUNCTUATION)|\ 385 bit(UnicodeType::INITIAL_PUNCTUATION)|\ 386 bit(UnicodeType::FINAL_PUNCTUATION)|\ 387 bit(UnicodeType::CONNECTOR_PUNCTUATION)|\ 388 bit(UnicodeType::OTHER_PUNCTUATION) 389 390 #define SYMBOLMASK bit(UnicodeType::MATH_SYMBOL)|\ 391 bit(UnicodeType::CURRENCY_SYMBOL)|\ 392 bit(UnicodeType::MODIFIER_SYMBOL)|\ 393 bit(UnicodeType::OTHER_SYMBOL) 394 395 #define PRINTMASK BASEMASK|SPACEMASK|PUNCTUATIONMASK|SYMBOLMASK 396 397 #define CONTROLMASK bit(UnicodeType::CONTROL)|\ 398 bit(UnicodeType::FORMAT)|\ 399 bit(UnicodeType::LINE_SEPARATOR)|\ 400 bit(UnicodeType::PARAGRAPH_SEPARATOR) 401 402 #define IsType(func, mask) \ 403 sal_Bool SAL_CALL func( const sal_Unicode ch) {\ 404 return (bit(getUnicodeType(ch)) & (mask)) != 0;\ 405 } 406 407 IsType(unicode::isUpper, UPPERMASK) 408 IsType(unicode::isLower, LOWERMASK) 409 IsType(unicode::isTitle, DIGITMASK) 410 IsType(unicode::isControl, CONTROLMASK) 411 IsType(unicode::isPrint, PRINTMASK) 412 IsType(unicode::isAlpha, ALPHAMASK) 413 IsType(unicode::isDigit, DIGITMASK) 414 IsType(unicode::isAlphaDigit, ALPHAMASK|DIGITMASK) 415 IsType(unicode::isSpace, SPACEMASK) 416 IsType(unicode::isBase, BASEMASK) 417 IsType(unicode::isPunctuation, PUNCTUATIONMASK) 418 419 #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\ 420 bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f) 421 422 sal_Bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) { 423 return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE))); 424 } 425 426 sal_Int32 SAL_CALL unicode::getCharType( const sal_Unicode ch ) 427 { 428 using namespace ::com::sun::star::i18n::KCharacterType; 429 430 switch ( getUnicodeType( ch ) ) { 431 // Upper 432 case UnicodeType::UPPERCASE_LETTER : 433 return UPPER|LETTER|PRINTABLE|BASE_FORM; 434 435 // Lower 436 case UnicodeType::LOWERCASE_LETTER : 437 return LOWER|LETTER|PRINTABLE|BASE_FORM; 438 439 // Title 440 case UnicodeType::TITLECASE_LETTER : 441 return TITLE_CASE|LETTER|PRINTABLE|BASE_FORM; 442 443 // Letter 444 case UnicodeType::MODIFIER_LETTER : 445 case UnicodeType::OTHER_LETTER : 446 return LETTER|PRINTABLE|BASE_FORM; 447 448 // Digit 449 case UnicodeType::DECIMAL_DIGIT_NUMBER: 450 case UnicodeType::LETTER_NUMBER: 451 case UnicodeType::OTHER_NUMBER: 452 return DIGIT|PRINTABLE|BASE_FORM; 453 454 // Base 455 case UnicodeType::NON_SPACING_MARK: 456 case UnicodeType::ENCLOSING_MARK: 457 case UnicodeType::COMBINING_SPACING_MARK: 458 return BASE_FORM|PRINTABLE; 459 460 // Print 461 case UnicodeType::SPACE_SEPARATOR: 462 463 case UnicodeType::DASH_PUNCTUATION: 464 case UnicodeType::INITIAL_PUNCTUATION: 465 case UnicodeType::FINAL_PUNCTUATION: 466 case UnicodeType::CONNECTOR_PUNCTUATION: 467 case UnicodeType::OTHER_PUNCTUATION: 468 469 case UnicodeType::MATH_SYMBOL: 470 case UnicodeType::CURRENCY_SYMBOL: 471 case UnicodeType::MODIFIER_SYMBOL: 472 case UnicodeType::OTHER_SYMBOL: 473 return PRINTABLE; 474 475 // Control 476 case UnicodeType::CONTROL: 477 case UnicodeType::FORMAT: 478 return CONTROL; 479 480 case UnicodeType::LINE_SEPARATOR: 481 case UnicodeType::PARAGRAPH_SEPARATOR: 482 return CONTROL|PRINTABLE; 483 484 // for all others 485 default: 486 return 0; 487 } 488 } 489 490 491