xref: /AOO41X/main/sal/textenc/unichars.c (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir 
28*cdf0e10cSrcweir #include "unichars.h"
29*cdf0e10cSrcweir #include "osl/diagnose.h"
30*cdf0e10cSrcweir #include "sal/types.h"
31*cdf0e10cSrcweir 
int ImplIsNoncharacter(sal_uInt32 nUtf32)
{
    /* Noncharacter test, as of Unicode 3.1.1.  Yields 1 for a noncharacter
       and 0 for anything else.
     */

    /* Values past the last code point U+10FFFF are reported as
       noncharacters as well. */
    if (nUtf32 > 0x10FFFF) {
        return 1;
    }

    /* Low 16 bits equal to 0xFFFE or 0xFFFF (U+FFFE, U+FFFF, U+1FFFE, ...). */
    if ((nUtf32 & 0xFFFF) >= 0xFFFE) {
        return 1;
    }

    /* The contiguous run U+FDD0..U+FDEF. */
    return nUtf32 >= 0xFDD0 && nUtf32 <= 0xFDEF;
}
39*cdf0e10cSrcweir 
int ImplIsControlOrFormat(sal_uInt32 nUtf32)
{
    /* Membership test for the code points that
       <http://www.unicode.org/Public/UNIDATA/UnicodeData.txt>, Version 3.1.1,
       lists with a General Category of Cc (Other, Control) or Cf (Other,
       Format).  Yields 1 for a member and 0 for anything else.
     */

    /* Cc: U+0000..U+001F and U+007F..U+009F. */
    if (nUtf32 <= 0x001F || (nUtf32 >= 0x007F && nUtf32 <= 0x009F)) {
        return 1;
    }

    /* Cf: code points that stand alone. */
    switch (nUtf32) {
    case 0x070F:  /* SYRIAC ABBREVIATION MARK */
    case 0xFEFF:  /* ZERO WIDTH NO-BREAK SPACE */
    case 0xE0001: /* LANGUAGE TAG */
        return 1;
    default:
        break;
    }

    /* Cf: runs of consecutive code points, each given as first..last. */
    return
        /* MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN VOWEL SEPARATOR */
        (nUtf32 >= 0x180B && nUtf32 <= 0x180E)
        /* ZERO WIDTH NON-JOINER..RIGHT-TO-LEFT MARK */
        || (nUtf32 >= 0x200C && nUtf32 <= 0x200F)
        /* LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE */
        || (nUtf32 >= 0x202A && nUtf32 <= 0x202E)
        /* INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES */
        || (nUtf32 >= 0x206A && nUtf32 <= 0x206F)
        /* INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR */
        || (nUtf32 >= 0xFFF9 && nUtf32 <= 0xFFFB)
        /* MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE */
        || (nUtf32 >= 0x1D173 && nUtf32 <= 0x1D17A)
        /* U+E0020..U+E007F */
        || (nUtf32 >= 0xE0020 && nUtf32 <= 0xE007F);
}
83*cdf0e10cSrcweir 
int ImplIsHighSurrogate(sal_uInt32 nUtf32)
{
    /* High-surrogate test, as of Unicode 3.1.1: yields 1 for
       U+D800..U+DBFF and 0 otherwise.  The 0x400 values of that range are
       exactly those whose bits above the low ten equal 0xD800 >> 10 (0x36).
     */
    return (nUtf32 >> 10) == 0x36;
}
89*cdf0e10cSrcweir 
int ImplIsLowSurrogate(sal_uInt32 nUtf32)
{
    /* Low-surrogate test, as of Unicode 3.1.1: yields 1 for
       U+DC00..U+DFFF and 0 otherwise.  The 0x400 values of that range are
       exactly those whose bits above the low ten equal 0xDC00 >> 10 (0x37).
     */
    return (nUtf32 >> 10) == 0x37;
}
95*cdf0e10cSrcweir 
int ImplIsPrivateUse(sal_uInt32 nUtf32)
{
    /* Membership test for the code points that
       <http://www.unicode.org/Public/UNIDATA/UnicodeData.txt>, Version 3.1.1,
       lists with a General Category of Co (Other, Private Use).  Yields 1
       for a member and 0 for anything else.
     */

    /* U+E000..U+F8FF */
    if (nUtf32 >= 0xE000 && nUtf32 <= 0xF8FF) {
        return 1;
    }

    /* U+F0000..U+FFFFD */
    if (nUtf32 >= 0xF0000 && nUtf32 <= 0xFFFFD) {
        return 1;
    }

    /* U+100000..U+10FFFD */
    return nUtf32 >= 0x100000 && nUtf32 <= 0x10FFFD;
}
106*cdf0e10cSrcweir 
int ImplIsZeroWidth(sal_uInt32 nUtf32)
{
    /* Membership test for the code points that
       <http://www.unicode.org/Public/UNIDATA/UnicodeData.txt>, Version 3.1.1,
       lists with "ZERO WIDTH" in their character name.  Yields 1 for a
       member and 0 for anything else.
     */
    switch (nUtf32) {
    case 0x200B: /* ZERO WIDTH SPACE */
    case 0x200C: /* ZERO WIDTH NON-JOINER */
    case 0x200D: /* ZERO WIDTH JOINER */
    case 0xFEFF: /* ZERO WIDTH NO-BREAK SPACE */
        return 1;
    default:
        return 0;
    }
}
118*cdf0e10cSrcweir 
sal_uInt32 ImplGetHighSurrogate(sal_uInt32 nUtf32)
{
    /* Returns the high (leading) surrogate for nUtf32: the bits above the
       low ten of (nUtf32 - 0x10000), OR-ed into 0xD800.  Callers are
       expected to pass a value of at least 0x10000; that precondition is
       handed to OSL_ENSURE.
     */
    sal_uInt32 nOffset;

    OSL_ENSURE(nUtf32 >= 0x10000, "specification violation");

    nOffset = nUtf32 - 0x10000;
    return 0xD800 | (nOffset >> 10);
}
124*cdf0e10cSrcweir 
sal_uInt32 ImplGetLowSurrogate(sal_uInt32 nUtf32)
{
    /* Returns the low (trailing) surrogate for nUtf32: the low ten bits of
       (nUtf32 - 0x10000), OR-ed into 0xDC00.  Callers are expected to pass a
       value of at least 0x10000; that precondition is handed to OSL_ENSURE.
     */
    sal_uInt32 nOffset;

    OSL_ENSURE(nUtf32 >= 0x10000, "specification violation");

    nOffset = nUtf32 - 0x10000;
    return 0xDC00 | (nOffset & 0x3FF);
}
130*cdf0e10cSrcweir 
sal_uInt32 ImplCombineSurrogates(sal_uInt32 nHigh, sal_uInt32 nLow)
{
    /* Rebuilds a code point from a surrogate pair: the low ten bits of nHigh
       become bits 10..19, the low ten bits of nLow become bits 0..9, and
       0x10000 is added to the result.  Callers are expected to pass a high
       surrogate in nHigh and a low surrogate in nLow; that precondition is
       handed to OSL_ENSURE.
     */
    sal_uInt32 nUpperBits;
    sal_uInt32 nLowerBits;

    OSL_ENSURE(ImplIsHighSurrogate(nHigh) && ImplIsLowSurrogate(nLow),
               "specification violation");

    nUpperBits = (nHigh & 0x3FF) << 10;
    nLowerBits = nLow & 0x3FF;
    return (nUpperBits | nLowerBits) + 0x10000;
}
137