1*cdf0e10cSrcweir /************************************************************************* 2*cdf0e10cSrcweir * 3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4*cdf0e10cSrcweir * 5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6*cdf0e10cSrcweir * 7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8*cdf0e10cSrcweir * 9*cdf0e10cSrcweir * This file is part of OpenOffice.org. 10*cdf0e10cSrcweir * 11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13*cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14*cdf0e10cSrcweir * 15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20*cdf0e10cSrcweir * 21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22*cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24*cdf0e10cSrcweir * for a copy of the LGPLv3 License. 25*cdf0e10cSrcweir * 26*cdf0e10cSrcweir ************************************************************************/ 27*cdf0e10cSrcweir 28*cdf0e10cSrcweir #include "unichars.h" 29*cdf0e10cSrcweir #include "osl/diagnose.h" 30*cdf0e10cSrcweir #include "sal/types.h" 31*cdf0e10cSrcweir 32*cdf0e10cSrcweir int ImplIsNoncharacter(sal_uInt32 nUtf32) 33*cdf0e10cSrcweir { 34*cdf0e10cSrcweir /* All code points that are noncharacters, as of Unicode 3.1.1. */ 35*cdf0e10cSrcweir return (nUtf32 >= 0xFDD0 && nUtf32 <= 0xFDEF) 36*cdf0e10cSrcweir || (nUtf32 & 0xFFFF) >= 0xFFFE 37*cdf0e10cSrcweir || nUtf32 > 0x10FFFF; 38*cdf0e10cSrcweir } 39*cdf0e10cSrcweir 40*cdf0e10cSrcweir int ImplIsControlOrFormat(sal_uInt32 nUtf32) 41*cdf0e10cSrcweir { 42*cdf0e10cSrcweir /* All code points of <http://www.unicode.org/Public/UNIDATA/ 43*cdf0e10cSrcweir UnicodeData.txt>, Version 3.1.1, that have a General Category of Cc 44*cdf0e10cSrcweir (Other, Control) or Cf (Other, Format). 45*cdf0e10cSrcweir */ 46*cdf0e10cSrcweir return nUtf32 <= 0x001F 47*cdf0e10cSrcweir || (nUtf32 >= 0x007F && nUtf32 <= 0x009F) 48*cdf0e10cSrcweir || nUtf32 == 0x070F /* SYRIAC ABBREVIATION MARK */ 49*cdf0e10cSrcweir || nUtf32 == 0x180B /* MONGOLIAN FREE VARIATION SELECTOR ONE */ 50*cdf0e10cSrcweir || nUtf32 == 0x180C /* MONGOLIAN FREE VARIATION SELECTOR TWO */ 51*cdf0e10cSrcweir || nUtf32 == 0x180D /* MONGOLIAN FREE VARIATION SELECTOR THREE */ 52*cdf0e10cSrcweir || nUtf32 == 0x180E /* MONGOLIAN VOWEL SEPARATOR */ 53*cdf0e10cSrcweir || nUtf32 == 0x200C /* ZERO WIDTH NON-JOINER */ 54*cdf0e10cSrcweir || nUtf32 == 0x200D /* ZERO WIDTH JOINER */ 55*cdf0e10cSrcweir || nUtf32 == 0x200E /* LEFT-TO-RIGHT MARK */ 56*cdf0e10cSrcweir || nUtf32 == 0x200F /* RIGHT-TO-LEFT MARK */ 57*cdf0e10cSrcweir || nUtf32 == 0x202A /* LEFT-TO-RIGHT EMBEDDING */ 58*cdf0e10cSrcweir || nUtf32 == 0x202B /* RIGHT-TO-LEFT EMBEDDING */ 59*cdf0e10cSrcweir || nUtf32 == 0x202C /* POP DIRECTIONAL FORMATTING */ 60*cdf0e10cSrcweir || nUtf32 == 0x202D /* LEFT-TO-RIGHT OVERRIDE */ 61*cdf0e10cSrcweir || nUtf32 == 0x202E /* RIGHT-TO-LEFT OVERRIDE */ 62*cdf0e10cSrcweir || nUtf32 == 0x206A /* INHIBIT SYMMETRIC SWAPPING */ 63*cdf0e10cSrcweir || nUtf32 == 0x206B /* ACTIVATE SYMMETRIC SWAPPING */ 64*cdf0e10cSrcweir || nUtf32 == 0x206C /* INHIBIT ARABIC FORM SHAPING */ 65*cdf0e10cSrcweir || nUtf32 == 0x206D /* ACTIVATE ARABIC FORM SHAPING */ 66*cdf0e10cSrcweir || nUtf32 == 0x206E /* NATIONAL DIGIT SHAPES */ 67*cdf0e10cSrcweir || nUtf32 == 0x206F /* NOMINAL DIGIT SHAPES */ 68*cdf0e10cSrcweir || nUtf32 == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */ 69*cdf0e10cSrcweir || nUtf32 == 0xFFF9 /* INTERLINEAR ANNOTATION ANCHOR */ 70*cdf0e10cSrcweir || nUtf32 == 0xFFFA /* INTERLINEAR ANNOTATION SEPARATOR */ 71*cdf0e10cSrcweir || nUtf32 == 0xFFFB /* INTERLINEAR ANNOTATION TERMINATOR */ 72*cdf0e10cSrcweir || nUtf32 == 0x1D173 /* MUSICAL SYMBOL BEGIN BEAM */ 73*cdf0e10cSrcweir || nUtf32 == 0x1D174 /* MUSICAL SYMBOL END BEAM */ 74*cdf0e10cSrcweir || nUtf32 == 0x1D175 /* MUSICAL SYMBOL BEGIN TIE */ 75*cdf0e10cSrcweir || nUtf32 == 0x1D176 /* MUSICAL SYMBOL END TIE */ 76*cdf0e10cSrcweir || nUtf32 == 0x1D177 /* MUSICAL SYMBOL BEGIN SLUR */ 77*cdf0e10cSrcweir || nUtf32 == 0x1D178 /* MUSICAL SYMBOL END SLUR */ 78*cdf0e10cSrcweir || nUtf32 == 0x1D179 /* MUSICAL SYMBOL BEGIN PHRASE */ 79*cdf0e10cSrcweir || nUtf32 == 0x1D17A /* MUSICAL SYMBOL END PHRASE */ 80*cdf0e10cSrcweir || nUtf32 == 0xE0001 /* LANGUAGE TAG */ 81*cdf0e10cSrcweir || (nUtf32 >= 0xE0020 && nUtf32 <= 0xE007F); 82*cdf0e10cSrcweir } 83*cdf0e10cSrcweir 84*cdf0e10cSrcweir int ImplIsHighSurrogate(sal_uInt32 nUtf32) 85*cdf0e10cSrcweir { 86*cdf0e10cSrcweir /* All code points that are high-surrogates, as of Unicode 3.1.1. */ 87*cdf0e10cSrcweir return nUtf32 >= 0xD800 && nUtf32 <= 0xDBFF; 88*cdf0e10cSrcweir } 89*cdf0e10cSrcweir 90*cdf0e10cSrcweir int ImplIsLowSurrogate(sal_uInt32 nUtf32) 91*cdf0e10cSrcweir { 92*cdf0e10cSrcweir /* All code points that are low-surrogates, as of Unicode 3.1.1. */ 93*cdf0e10cSrcweir return nUtf32 >= 0xDC00 && nUtf32 <= 0xDFFF; 94*cdf0e10cSrcweir } 95*cdf0e10cSrcweir 96*cdf0e10cSrcweir int ImplIsPrivateUse(sal_uInt32 nUtf32) 97*cdf0e10cSrcweir { 98*cdf0e10cSrcweir /* All code points of <http://www.unicode.org/Public/UNIDATA/ 99*cdf0e10cSrcweir UnicodeData.txt>, Version 3.1.1, that have a General Category of Co 100*cdf0e10cSrcweir (Other, Private Use). 101*cdf0e10cSrcweir */ 102*cdf0e10cSrcweir return (nUtf32 >= 0xE000 && nUtf32 <= 0xF8FF) 103*cdf0e10cSrcweir || (nUtf32 >= 0xF0000 && nUtf32 <= 0xFFFFD) 104*cdf0e10cSrcweir || (nUtf32 >= 0x100000 && nUtf32 <= 0x10FFFD); 105*cdf0e10cSrcweir } 106*cdf0e10cSrcweir 107*cdf0e10cSrcweir int ImplIsZeroWidth(sal_uInt32 nUtf32) 108*cdf0e10cSrcweir { 109*cdf0e10cSrcweir /* All code points of <http://www.unicode.org/Public/UNIDATA/ 110*cdf0e10cSrcweir UnicodeData.txt>, Version 3.1.1, that have "ZERO WIDTH" in their 111*cdf0e10cSrcweir Character name. 112*cdf0e10cSrcweir */ 113*cdf0e10cSrcweir return nUtf32 == 0x200B /* ZERO WIDTH SPACE */ 114*cdf0e10cSrcweir || nUtf32 == 0x200C /* ZERO WIDTH NON-JOINER */ 115*cdf0e10cSrcweir || nUtf32 == 0x200D /* ZERO WIDTH JOINER */ 116*cdf0e10cSrcweir || nUtf32 == 0xFEFF; /* ZEOR WIDTH NO-BREAK SPACE */ 117*cdf0e10cSrcweir } 118*cdf0e10cSrcweir 119*cdf0e10cSrcweir sal_uInt32 ImplGetHighSurrogate(sal_uInt32 nUtf32) 120*cdf0e10cSrcweir { 121*cdf0e10cSrcweir OSL_ENSURE(nUtf32 >= 0x10000, "specification violation"); 122*cdf0e10cSrcweir return ((nUtf32 - 0x10000) >> 10) | 0xD800; 123*cdf0e10cSrcweir } 124*cdf0e10cSrcweir 125*cdf0e10cSrcweir sal_uInt32 ImplGetLowSurrogate(sal_uInt32 nUtf32) 126*cdf0e10cSrcweir { 127*cdf0e10cSrcweir OSL_ENSURE(nUtf32 >= 0x10000, "specification violation"); 128*cdf0e10cSrcweir return ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00; 129*cdf0e10cSrcweir } 130*cdf0e10cSrcweir 131*cdf0e10cSrcweir sal_uInt32 ImplCombineSurrogates(sal_uInt32 nHigh, sal_uInt32 nLow) 132*cdf0e10cSrcweir { 133*cdf0e10cSrcweir OSL_ENSURE(ImplIsHighSurrogate(nHigh) && ImplIsLowSurrogate(nLow), 134*cdf0e10cSrcweir "specification violation"); 135*cdf0e10cSrcweir return (((nHigh & 0x3FF) << 10) | (nLow & 0x3FF)) + 0x10000; 136*cdf0e10cSrcweir } 137