xref: /AOO41X/main/i18nutil/source/utility/widthfolding.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir 
28*cdf0e10cSrcweir // prevent internal compiler error with MSVC6SP3
29*cdf0e10cSrcweir #include <utility>
30*cdf0e10cSrcweir #include <i18nutil/widthfolding.hxx>
31*cdf0e10cSrcweir #include <i18nutil/x_rtl_ustring.h>
32*cdf0e10cSrcweir #include "widthfolding_data.h"
33*cdf0e10cSrcweir 
34*cdf0e10cSrcweir using namespace com::sun::star::uno;
35*cdf0e10cSrcweir using namespace rtl;
36*cdf0e10cSrcweir 
37*cdf0e10cSrcweir namespace com { namespace sun { namespace star { namespace i18n {
38*cdf0e10cSrcweir 
39*cdf0e10cSrcweir sal_Unicode widthfolding::decompose_ja_voiced_sound_marksChar2Char (sal_Unicode inChar)
40*cdf0e10cSrcweir {
41*cdf0e10cSrcweir     if (0x30a0 <= inChar && inChar <= 0x30ff) {
42*cdf0e10cSrcweir       sal_Int16 i = inChar - 0x3040;
43*cdf0e10cSrcweir       if (decomposition_table[i].decomposited_character_1)
44*cdf0e10cSrcweir           return 0xFFFF;
45*cdf0e10cSrcweir     }
46*cdf0e10cSrcweir     return inChar;
47*cdf0e10cSrcweir }
48*cdf0e10cSrcweir 
49*cdf0e10cSrcweir /**
50*cdf0e10cSrcweir  * Decompose Japanese specific voiced and semi-voiced sound marks.
51*cdf0e10cSrcweir  */
52*cdf0e10cSrcweir OUString widthfolding::decompose_ja_voiced_sound_marks (const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, Sequence< sal_Int32 >& offset, sal_Bool useOffset )
53*cdf0e10cSrcweir {
54*cdf0e10cSrcweir   // Create a string buffer which can hold nCount * 2 + 1 characters.
55*cdf0e10cSrcweir   // Its size may become double of nCount.
56*cdf0e10cSrcweir   rtl_uString * newStr;
57*cdf0e10cSrcweir   x_rtl_uString_new_WithLength( &newStr, nCount * 2 ); // defined in x_rtl_ustring.h  The reference count is 0 now.
58*cdf0e10cSrcweir 
59*cdf0e10cSrcweir   sal_Int32 *p = NULL;
60*cdf0e10cSrcweir   sal_Int32 position = 0;
61*cdf0e10cSrcweir   if (useOffset) {
62*cdf0e10cSrcweir       // Allocate double of nCount length to offset argument.
63*cdf0e10cSrcweir       offset.realloc( nCount * 2 );
64*cdf0e10cSrcweir       p = offset.getArray();
65*cdf0e10cSrcweir       position = startPos;
66*cdf0e10cSrcweir   }
67*cdf0e10cSrcweir 
68*cdf0e10cSrcweir   // Prepare pointers of unicode character arrays.
69*cdf0e10cSrcweir   const sal_Unicode* src = inStr.getStr() + startPos;
70*cdf0e10cSrcweir   sal_Unicode* dst = newStr->buffer;
71*cdf0e10cSrcweir 
72*cdf0e10cSrcweir   // Decomposition: GA --> KA + voice-mark
73*cdf0e10cSrcweir   while (nCount -- > 0) {
74*cdf0e10cSrcweir     sal_Unicode c = *src++;
75*cdf0e10cSrcweir     // see http://charts.unicode.org/Web/U3040.html Hiragana (U+3040..U+309F)
76*cdf0e10cSrcweir     // see http://charts.unicode.org/Web/U30A0.html Katakana (U+30A0..U+30FF)
77*cdf0e10cSrcweir     // Hiragana is not applied to decomposition.
78*cdf0e10cSrcweir     // Only Katakana is applied to decomposition
79*cdf0e10cSrcweir     if (0x30a0 <= c && c <= 0x30ff) {
80*cdf0e10cSrcweir       int i = int(c - 0x3040);
81*cdf0e10cSrcweir       sal_Unicode first = decomposition_table[i].decomposited_character_1;
82*cdf0e10cSrcweir       if (first != 0x0000) {
83*cdf0e10cSrcweir 	*dst ++ = first;
84*cdf0e10cSrcweir 	*dst ++ = decomposition_table[i].decomposited_character_2; // second
85*cdf0e10cSrcweir         if (useOffset) {
86*cdf0e10cSrcweir             *p ++ = position;
87*cdf0e10cSrcweir             *p ++ = position ++;
88*cdf0e10cSrcweir         }
89*cdf0e10cSrcweir 	continue;
90*cdf0e10cSrcweir       }
91*cdf0e10cSrcweir     }
92*cdf0e10cSrcweir     *dst ++ = c;
93*cdf0e10cSrcweir     if (useOffset)
94*cdf0e10cSrcweir         *p ++ = position ++;
95*cdf0e10cSrcweir   }
96*cdf0e10cSrcweir   *dst = (sal_Unicode) 0;
97*cdf0e10cSrcweir 
98*cdf0e10cSrcweir   newStr->length = sal_Int32(dst - newStr->buffer);
99*cdf0e10cSrcweir   if (useOffset)
100*cdf0e10cSrcweir       offset.realloc(newStr->length);
101*cdf0e10cSrcweir   return OUString( newStr ); // defined in rtl/usrting. The reference count is increased from 0 to 1.
102*cdf0e10cSrcweir }
103*cdf0e10cSrcweir 
104*cdf0e10cSrcweir oneToOneMapping& widthfolding::getfull2halfTable(void)
105*cdf0e10cSrcweir {
106*cdf0e10cSrcweir     static oneToOneMappingWithFlag table(full2half, sizeof(full2half), FULL2HALF_NORMAL);
107*cdf0e10cSrcweir     table.makeIndex();
108*cdf0e10cSrcweir     return table;
109*cdf0e10cSrcweir }
110*cdf0e10cSrcweir 
111*cdf0e10cSrcweir /**
112*cdf0e10cSrcweir  * Compose Japanese specific voiced and semi-voiced sound marks.
113*cdf0e10cSrcweir  */
114*cdf0e10cSrcweir OUString widthfolding::compose_ja_voiced_sound_marks (const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, Sequence< sal_Int32 >& offset, sal_Bool useOffset, sal_Int32 nFlags )
115*cdf0e10cSrcweir {
116*cdf0e10cSrcweir   // Create a string buffer which can hold nCount + 1 characters.
117*cdf0e10cSrcweir   // Its size may become equal to nCount or smaller.
118*cdf0e10cSrcweir   // The reference count is 0 now.
119*cdf0e10cSrcweir   rtl_uString * newStr = x_rtl_uString_new_WithLength( nCount ); // defined in x_rtl_ustring.h
120*cdf0e10cSrcweir 
121*cdf0e10cSrcweir   // Prepare pointers of unicode character arrays.
122*cdf0e10cSrcweir   const sal_Unicode* src = inStr.getStr() + startPos;
123*cdf0e10cSrcweir   sal_Unicode* dst = newStr->buffer;
124*cdf0e10cSrcweir 
125*cdf0e10cSrcweir   // This conversion algorithm requires at least one character.
126*cdf0e10cSrcweir  if (nCount > 0) {
127*cdf0e10cSrcweir 
128*cdf0e10cSrcweir   // .. .. KA         VOICE .. ..
129*cdf0e10cSrcweir   //       ^          ^
130*cdf0e10cSrcweir   //       previousChar   currentChar
131*cdf0e10cSrcweir   //       ^
132*cdf0e10cSrcweir   //       position
133*cdf0e10cSrcweir   //
134*cdf0e10cSrcweir   // will be converted to
135*cdf0e10cSrcweir   // .. .. GA       .. ..
136*cdf0e10cSrcweir 
137*cdf0e10cSrcweir   sal_Int32 *p = NULL;
138*cdf0e10cSrcweir   sal_Int32 position = 0;
139*cdf0e10cSrcweir   if (useOffset) {
140*cdf0e10cSrcweir       // Allocate nCount length to offset argument.
141*cdf0e10cSrcweir       offset.realloc( nCount );
142*cdf0e10cSrcweir       p = offset.getArray();
143*cdf0e10cSrcweir       position = startPos;
144*cdf0e10cSrcweir   }
145*cdf0e10cSrcweir 
146*cdf0e10cSrcweir   //
147*cdf0e10cSrcweir   sal_Unicode previousChar = *src ++;
148*cdf0e10cSrcweir   sal_Unicode currentChar;
149*cdf0e10cSrcweir 
150*cdf0e10cSrcweir   // Composition: KA + voice-mark --> GA
151*cdf0e10cSrcweir   while (-- nCount > 0) {
152*cdf0e10cSrcweir     currentChar = *src ++;
153*cdf0e10cSrcweir     // see http://charts.unicode.org/Web/U3040.html Hiragana (U+3040..U+309F)
154*cdf0e10cSrcweir     // see http://charts.unicode.org/Web/U30A0.html Katakana (U+30A0..U+30FF)
155*cdf0e10cSrcweir     // 0x3099 COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
156*cdf0e10cSrcweir     // 0x309a COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
157*cdf0e10cSrcweir     // 0x309b KATAKANA-HIRAGANA VOICED SOUND MARK
158*cdf0e10cSrcweir     // 0x309c KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
159*cdf0e10cSrcweir     int j = currentChar - 0x3099; // 0x3099, 0x309a, 0x309b, 0x309c ?
160*cdf0e10cSrcweir 
161*cdf0e10cSrcweir     if (2 <= j && j <= 3) // 0x309b or 0x309c
162*cdf0e10cSrcweir         j -= 2;
163*cdf0e10cSrcweir 
164*cdf0e10cSrcweir     if (0 <= j && j <= 1) {
165*cdf0e10cSrcweir       // 0 addresses a code point regarding 0x3099 or 0x309b (voiced sound mark),
166*cdf0e10cSrcweir       // 1 is 0x309a or 0x309c (semi-voiced sound mark)
167*cdf0e10cSrcweir       int i = int(previousChar - 0x3040); // i acts as an index of array
168*cdf0e10cSrcweir       sal_Bool bCompose = sal_False;
169*cdf0e10cSrcweir 
170*cdf0e10cSrcweir       if (0 <= i && i <= (0x30ff - 0x3040) && composition_table[i][j])
171*cdf0e10cSrcweir         bCompose = sal_True;
172*cdf0e10cSrcweir 
173*cdf0e10cSrcweir       // not to use combined KATAKANA LETTER VU
174*cdf0e10cSrcweir       if ( previousChar == 0x30a6 && (nFlags & WIDTHFOLDNIG_DONT_USE_COMBINED_VU) )
175*cdf0e10cSrcweir         bCompose = sal_False;
176*cdf0e10cSrcweir 
177*cdf0e10cSrcweir       if( bCompose ){
178*cdf0e10cSrcweir         if (useOffset) {
179*cdf0e10cSrcweir             position ++;
180*cdf0e10cSrcweir             *p ++ = position ++;
181*cdf0e10cSrcweir         }
182*cdf0e10cSrcweir 	*dst ++ =  composition_table[i][j];
183*cdf0e10cSrcweir 	previousChar = *src ++;
184*cdf0e10cSrcweir 	nCount --;
185*cdf0e10cSrcweir 	continue;
186*cdf0e10cSrcweir       }
187*cdf0e10cSrcweir     }
188*cdf0e10cSrcweir     if (useOffset)
189*cdf0e10cSrcweir         *p ++ = position ++;
190*cdf0e10cSrcweir     *dst ++ = previousChar;
191*cdf0e10cSrcweir     previousChar = currentChar;
192*cdf0e10cSrcweir   }
193*cdf0e10cSrcweir 
194*cdf0e10cSrcweir   if (nCount == 0) {
195*cdf0e10cSrcweir     if (useOffset)
196*cdf0e10cSrcweir         *p = position;
197*cdf0e10cSrcweir     *dst ++ = previousChar;
198*cdf0e10cSrcweir   }
199*cdf0e10cSrcweir 
200*cdf0e10cSrcweir   *dst = (sal_Unicode) 0;
201*cdf0e10cSrcweir 
202*cdf0e10cSrcweir   newStr->length = sal_Int32(dst - newStr->buffer);
203*cdf0e10cSrcweir  }
204*cdf0e10cSrcweir   if (useOffset)
205*cdf0e10cSrcweir       offset.realloc(newStr->length);
206*cdf0e10cSrcweir   return OUString( newStr ); // defined in rtl/usrting. The reference count is increased from 0 to 1.
207*cdf0e10cSrcweir }
208*cdf0e10cSrcweir 
209*cdf0e10cSrcweir oneToOneMapping& widthfolding::gethalf2fullTable(void)
210*cdf0e10cSrcweir {
211*cdf0e10cSrcweir     static oneToOneMappingWithFlag table(half2full, sizeof(half2full), HALF2FULL_NORMAL);
212*cdf0e10cSrcweir     table.makeIndex();
213*cdf0e10cSrcweir     return table;
214*cdf0e10cSrcweir }
215*cdf0e10cSrcweir 
216*cdf0e10cSrcweir sal_Unicode widthfolding::getCompositionChar(sal_Unicode c1, sal_Unicode c2)
217*cdf0e10cSrcweir {
218*cdf0e10cSrcweir     return composition_table[c1 - 0x3040][c2 - 0x3099];
219*cdf0e10cSrcweir }
220*cdf0e10cSrcweir 
221*cdf0e10cSrcweir 
222*cdf0e10cSrcweir oneToOneMapping& widthfolding::getfull2halfTableForASC()
223*cdf0e10cSrcweir {
224*cdf0e10cSrcweir     static oneToOneMappingWithFlag table(full2half, sizeof(full2half), FULL2HALF_ASC_FUNCTION);
225*cdf0e10cSrcweir     table.makeIndex();
226*cdf0e10cSrcweir 
227*cdf0e10cSrcweir     // bluedwarf: dirty hack!
228*cdf0e10cSrcweir     // There is an exception. Additional conversion is required following:
229*cdf0e10cSrcweir     //  0xFFE5 (FULLWIDTH YEN SIGN)  --> 0x005C (REVERSE SOLIDUS)
230*cdf0e10cSrcweir     //
231*cdf0e10cSrcweir     //  See the following page for detail:
232*cdf0e10cSrcweir     // http://wiki.services.openoffice.org/wiki/Calc/Features/JIS_and_ASC_functions
233*cdf0e10cSrcweir     int i, j, high, low;
234*cdf0e10cSrcweir     int n = sizeof(full2halfASCException) / sizeof(UnicodePairWithFlag);
235*cdf0e10cSrcweir     for( i = 0; i < n; i++ )
236*cdf0e10cSrcweir     {
237*cdf0e10cSrcweir         high = (full2halfASCException[i].first >> 8) & 0xFF;
238*cdf0e10cSrcweir         low  = (full2halfASCException[i].first)      & 0xFF;
239*cdf0e10cSrcweir 
240*cdf0e10cSrcweir         if( !table.mpIndex[high] )
241*cdf0e10cSrcweir         {
242*cdf0e10cSrcweir             table.mpIndex[high] = new UnicodePairWithFlag*[256];
243*cdf0e10cSrcweir 
244*cdf0e10cSrcweir             for( j = 0; j < 256; j++ )
245*cdf0e10cSrcweir                 table.mpIndex[high][j] = NULL;
246*cdf0e10cSrcweir         }
247*cdf0e10cSrcweir         table.mpIndex[high][low] = &full2halfASCException[i];
248*cdf0e10cSrcweir     }
249*cdf0e10cSrcweir 
250*cdf0e10cSrcweir     return table;
251*cdf0e10cSrcweir }
252*cdf0e10cSrcweir 
253*cdf0e10cSrcweir oneToOneMapping& widthfolding::gethalf2fullTableForJIS()
254*cdf0e10cSrcweir {
255*cdf0e10cSrcweir     static oneToOneMappingWithFlag table(half2full, sizeof(half2full), HALF2FULL_JIS_FUNCTION);
256*cdf0e10cSrcweir     table.makeIndex();
257*cdf0e10cSrcweir 
258*cdf0e10cSrcweir     // bluedwarf: dirty hack!
259*cdf0e10cSrcweir     //  There are some exceptions. Additional conversion are required following:
260*cdf0e10cSrcweir     //  0x0022 (QUOTATION MARK)  --> 0x201D (RIGHT DOUBLE QUOTATION MARK)
261*cdf0e10cSrcweir     //  0x0027 (APOSTROPHE)      --> 0x2019 (RIGHT SINGLE QUOTATION MARK)
262*cdf0e10cSrcweir     //  0x005C (REVERSE SOLIDUS) --> 0xFFE5 (FULLWIDTH YEN SIGN)
263*cdf0e10cSrcweir     //  0x0060 (GRAVE ACCENT)    --> 0x2018 (LEFT SINGLE QUOTATION MARK)
264*cdf0e10cSrcweir     //
265*cdf0e10cSrcweir     //  See the following page for detail:
266*cdf0e10cSrcweir     // http://wiki.services.openoffice.org/wiki/Calc/Features/JIS_and_ASC_functions
267*cdf0e10cSrcweir     int i, j, high, low;
268*cdf0e10cSrcweir     int n = sizeof(half2fullJISException) / sizeof(UnicodePairWithFlag);
269*cdf0e10cSrcweir     for( i = 0; i < n; i++ )
270*cdf0e10cSrcweir     {
271*cdf0e10cSrcweir         high = (half2fullJISException[i].first >> 8) & 0xFF;
272*cdf0e10cSrcweir         low  = (half2fullJISException[i].first)      & 0xFF;
273*cdf0e10cSrcweir 
274*cdf0e10cSrcweir         if( !table.mpIndex[high] )
275*cdf0e10cSrcweir         {
276*cdf0e10cSrcweir             table.mpIndex[high] = new UnicodePairWithFlag*[256];
277*cdf0e10cSrcweir 
278*cdf0e10cSrcweir             for( j = 0; j < 256; j++ )
279*cdf0e10cSrcweir                 table.mpIndex[high][j] = NULL;
280*cdf0e10cSrcweir         }
281*cdf0e10cSrcweir         table.mpIndex[high][low] = &half2fullJISException[i];
282*cdf0e10cSrcweir     }
283*cdf0e10cSrcweir 
284*cdf0e10cSrcweir     return table;
285*cdf0e10cSrcweir }
286*cdf0e10cSrcweir 
287*cdf0e10cSrcweir oneToOneMapping& widthfolding::getfullKana2halfKanaTable()
288*cdf0e10cSrcweir {
289*cdf0e10cSrcweir     static oneToOneMappingWithFlag table(full2half, sizeof(full2half), FULL2HALF_KATAKANA_ONLY);
290*cdf0e10cSrcweir     table.makeIndex();
291*cdf0e10cSrcweir     return table;
292*cdf0e10cSrcweir }
293*cdf0e10cSrcweir 
294*cdf0e10cSrcweir oneToOneMapping& widthfolding::gethalfKana2fullKanaTable()
295*cdf0e10cSrcweir {
296*cdf0e10cSrcweir     static oneToOneMappingWithFlag table(half2full, sizeof(half2full), HALF2FULL_KATAKANA_ONLY);
297*cdf0e10cSrcweir     table.makeIndex();
298*cdf0e10cSrcweir     return table;
299*cdf0e10cSrcweir }
300*cdf0e10cSrcweir 
301*cdf0e10cSrcweir } } } }
302