xref: /AOO41X/main/i18npool/source/breakiterator/breakiteratorImpl.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir 
28*cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
29*cdf0e10cSrcweir #include "precompiled_i18npool.hxx"
30*cdf0e10cSrcweir 
31*cdf0e10cSrcweir #include <breakiteratorImpl.hxx>
32*cdf0e10cSrcweir #include <unicode/uchar.h>
33*cdf0e10cSrcweir #include <rtl/ustrbuf.hxx>
34*cdf0e10cSrcweir 
35*cdf0e10cSrcweir using namespace ::com::sun::star::uno;
36*cdf0e10cSrcweir using namespace ::com::sun::star::lang;
37*cdf0e10cSrcweir using namespace ::rtl;
38*cdf0e10cSrcweir 
39*cdf0e10cSrcweir namespace com { namespace sun { namespace star { namespace i18n {
40*cdf0e10cSrcweir 
41*cdf0e10cSrcweir BreakIteratorImpl::BreakIteratorImpl( const Reference < XMultiServiceFactory >& rxMSF ) : xMSF( rxMSF )
42*cdf0e10cSrcweir {
43*cdf0e10cSrcweir }
44*cdf0e10cSrcweir 
45*cdf0e10cSrcweir BreakIteratorImpl::BreakIteratorImpl()
46*cdf0e10cSrcweir {
47*cdf0e10cSrcweir }
48*cdf0e10cSrcweir 
49*cdf0e10cSrcweir BreakIteratorImpl::~BreakIteratorImpl()
50*cdf0e10cSrcweir {
51*cdf0e10cSrcweir         // Clear lookuptable
52*cdf0e10cSrcweir         for (size_t l = 0; l < lookupTable.size(); l++)
53*cdf0e10cSrcweir             delete lookupTable[l];
54*cdf0e10cSrcweir         lookupTable.clear();
55*cdf0e10cSrcweir }
56*cdf0e10cSrcweir 
57*cdf0e10cSrcweir #define LBI getLocaleSpecificBreakIterator(rLocale)
58*cdf0e10cSrcweir 
59*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
60*cdf0e10cSrcweir         const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
61*cdf0e10cSrcweir         throw(RuntimeException)
62*cdf0e10cSrcweir {
63*cdf0e10cSrcweir         if (nCount < 0) throw RuntimeException();
64*cdf0e10cSrcweir 
65*cdf0e10cSrcweir         return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
66*cdf0e10cSrcweir }
67*cdf0e10cSrcweir 
68*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
69*cdf0e10cSrcweir         const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
70*cdf0e10cSrcweir         throw(RuntimeException)
71*cdf0e10cSrcweir {
72*cdf0e10cSrcweir         if (nCount < 0) throw RuntimeException();
73*cdf0e10cSrcweir 
74*cdf0e10cSrcweir         return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
75*cdf0e10cSrcweir }
76*cdf0e10cSrcweir 
77*cdf0e10cSrcweir #define isZWSP(c) (ch == 0x200B)
78*cdf0e10cSrcweir 
79*cdf0e10cSrcweir static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, sal_Bool bDirection)
80*cdf0e10cSrcweir {
81*cdf0e10cSrcweir 		sal_uInt32 ch=0;
82*cdf0e10cSrcweir 		sal_Int32 pos=nPos;
83*cdf0e10cSrcweir         switch (rWordType) {
84*cdf0e10cSrcweir             case WordType::ANYWORD_IGNOREWHITESPACES:
85*cdf0e10cSrcweir                 if (bDirection)
86*cdf0e10cSrcweir                     while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
87*cdf0e10cSrcweir                 else
88*cdf0e10cSrcweir                     while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
89*cdf0e10cSrcweir             break;
90*cdf0e10cSrcweir             case WordType::DICTIONARY_WORD:
91*cdf0e10cSrcweir                 if (bDirection)
92*cdf0e10cSrcweir                     while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch) ||
93*cdf0e10cSrcweir                             ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
94*cdf0e10cSrcweir                 else
95*cdf0e10cSrcweir                     while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch) ||
96*cdf0e10cSrcweir                             ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
97*cdf0e10cSrcweir             break;
98*cdf0e10cSrcweir             case WordType::WORD_COUNT:
99*cdf0e10cSrcweir                 if (bDirection)
100*cdf0e10cSrcweir                     while (nPos < len && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
101*cdf0e10cSrcweir                 else
102*cdf0e10cSrcweir                     while (nPos > 0 && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
103*cdf0e10cSrcweir             break;
104*cdf0e10cSrcweir         }
105*cdf0e10cSrcweir         return nPos;
106*cdf0e10cSrcweir }
107*cdf0e10cSrcweir 
108*cdf0e10cSrcweir Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
109*cdf0e10cSrcweir         const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
110*cdf0e10cSrcweir {
111*cdf0e10cSrcweir         sal_Int32 len = Text.getLength();
112*cdf0e10cSrcweir         if( nStartPos < 0 || len == 0 )
113*cdf0e10cSrcweir             result.endPos = result.startPos = 0;
114*cdf0e10cSrcweir         else if (nStartPos >= len)
115*cdf0e10cSrcweir             result.endPos = result.startPos = len;
116*cdf0e10cSrcweir         else {
117*cdf0e10cSrcweir             result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
118*cdf0e10cSrcweir 
119*cdf0e10cSrcweir             nStartPos = skipSpace(Text, result.startPos, len, rWordType, sal_True);
120*cdf0e10cSrcweir 
121*cdf0e10cSrcweir             if ( nStartPos != result.startPos) {
122*cdf0e10cSrcweir                 if( nStartPos >= len )
123*cdf0e10cSrcweir                     result.startPos = result.endPos = len;
124*cdf0e10cSrcweir                 else {
125*cdf0e10cSrcweir                     result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, sal_True);
126*cdf0e10cSrcweir                     // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
127*cdf0e10cSrcweir                     if (result.startPos < nStartPos) result.startPos = nStartPos;
128*cdf0e10cSrcweir                 }
129*cdf0e10cSrcweir             }
130*cdf0e10cSrcweir         }
131*cdf0e10cSrcweir         return result;
132*cdf0e10cSrcweir }
133*cdf0e10cSrcweir 
134*cdf0e10cSrcweir static inline sal_Bool SAL_CALL isCJK( const Locale& rLocale ) {
135*cdf0e10cSrcweir         return rLocale.Language.equalsAscii("zh") || rLocale.Language.equalsAscii("ja") || rLocale.Language.equalsAscii("ko");
136*cdf0e10cSrcweir }
137*cdf0e10cSrcweir 
138*cdf0e10cSrcweir Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
139*cdf0e10cSrcweir         const Locale& rLocale, sal_Int16 rWordType) throw(RuntimeException)
140*cdf0e10cSrcweir {
141*cdf0e10cSrcweir         sal_Int32 len = Text.getLength();
142*cdf0e10cSrcweir         if( nStartPos <= 0 || len == 0 ) {
143*cdf0e10cSrcweir             result.endPos = result.startPos = 0;
144*cdf0e10cSrcweir             return result;
145*cdf0e10cSrcweir         } else if (nStartPos > len) {
146*cdf0e10cSrcweir             result.endPos = result.startPos = len;
147*cdf0e10cSrcweir             return result;
148*cdf0e10cSrcweir         }
149*cdf0e10cSrcweir 
150*cdf0e10cSrcweir         sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, sal_False);
151*cdf0e10cSrcweir 
152*cdf0e10cSrcweir         // if some spaces are skiped, and the script type is Asian with no CJK rLocale, we have to return
153*cdf0e10cSrcweir         // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
154*cdf0e10cSrcweir         result.startPos = nPos;
155*cdf0e10cSrcweir         if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
156*cdf0e10cSrcweir             result.endPos = -1;
157*cdf0e10cSrcweir             return result;
158*cdf0e10cSrcweir         }
159*cdf0e10cSrcweir 
160*cdf0e10cSrcweir         return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
161*cdf0e10cSrcweir }
162*cdf0e10cSrcweir 
163*cdf0e10cSrcweir 
164*cdf0e10cSrcweir Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
165*cdf0e10cSrcweir         sal_Int16 rWordType, sal_Bool bDirection ) throw(RuntimeException)
166*cdf0e10cSrcweir {
167*cdf0e10cSrcweir         sal_Int32 len = Text.getLength();
168*cdf0e10cSrcweir         if( nPos < 0 || len == 0 )
169*cdf0e10cSrcweir             result.endPos = result.startPos = 0;
170*cdf0e10cSrcweir         else if (nPos > len)
171*cdf0e10cSrcweir             result.endPos = result.startPos = len;
172*cdf0e10cSrcweir         else {
173*cdf0e10cSrcweir             sal_Int32 next, prev;
174*cdf0e10cSrcweir             next = skipSpace(Text, nPos, len, rWordType, sal_True);
175*cdf0e10cSrcweir             prev = skipSpace(Text, nPos, len, rWordType, sal_False);
176*cdf0e10cSrcweir             if (prev == 0 && next == len) {
177*cdf0e10cSrcweir                 result.endPos = result.startPos = nPos;
178*cdf0e10cSrcweir             } else if (prev == 0 && ! bDirection) {
179*cdf0e10cSrcweir                 result.endPos = result.startPos = 0;
180*cdf0e10cSrcweir             } else if (next == len && bDirection) {
181*cdf0e10cSrcweir                 result.endPos = result.startPos = len;
182*cdf0e10cSrcweir             } else {
183*cdf0e10cSrcweir                 if (next != prev) {
184*cdf0e10cSrcweir                     if (next == nPos && next != len)
185*cdf0e10cSrcweir                         bDirection = sal_True;
186*cdf0e10cSrcweir                     else if (prev == nPos && prev != 0)
187*cdf0e10cSrcweir                         bDirection = sal_False;
188*cdf0e10cSrcweir                     else
189*cdf0e10cSrcweir                         nPos = bDirection ? next : prev;
190*cdf0e10cSrcweir                 }
191*cdf0e10cSrcweir                 result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
192*cdf0e10cSrcweir             }
193*cdf0e10cSrcweir         }
194*cdf0e10cSrcweir         return result;
195*cdf0e10cSrcweir }
196*cdf0e10cSrcweir 
197*cdf0e10cSrcweir sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
198*cdf0e10cSrcweir         const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
199*cdf0e10cSrcweir {
200*cdf0e10cSrcweir         sal_Int32 len = Text.getLength();
201*cdf0e10cSrcweir 
202*cdf0e10cSrcweir         if (nPos < 0 || nPos >= len) return sal_False;
203*cdf0e10cSrcweir 
204*cdf0e10cSrcweir         sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_True);
205*cdf0e10cSrcweir 
206*cdf0e10cSrcweir         if (tmp != nPos) return sal_False;
207*cdf0e10cSrcweir 
208*cdf0e10cSrcweir         result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_True);
209*cdf0e10cSrcweir 
210*cdf0e10cSrcweir         return result.startPos == nPos;
211*cdf0e10cSrcweir }
212*cdf0e10cSrcweir 
213*cdf0e10cSrcweir sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
214*cdf0e10cSrcweir         const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException)
215*cdf0e10cSrcweir {
216*cdf0e10cSrcweir         sal_Int32 len = Text.getLength();
217*cdf0e10cSrcweir 
218*cdf0e10cSrcweir         if (nPos <= 0 || nPos > len) return sal_False;
219*cdf0e10cSrcweir 
220*cdf0e10cSrcweir         sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_False);
221*cdf0e10cSrcweir 
222*cdf0e10cSrcweir         if (tmp != nPos) return sal_False;
223*cdf0e10cSrcweir 
224*cdf0e10cSrcweir         result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_False);
225*cdf0e10cSrcweir 
226*cdf0e10cSrcweir         return result.endPos == nPos;
227*cdf0e10cSrcweir }
228*cdf0e10cSrcweir 
229*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
230*cdf0e10cSrcweir         const Locale &rLocale ) throw(RuntimeException)
231*cdf0e10cSrcweir {
232*cdf0e10cSrcweir         if (nStartPos < 0 || nStartPos > Text.getLength())
233*cdf0e10cSrcweir             return -1;
234*cdf0e10cSrcweir         if (Text.getLength() == 0) return 0;
235*cdf0e10cSrcweir         return LBI->beginOfSentence(Text, nStartPos, rLocale);
236*cdf0e10cSrcweir }
237*cdf0e10cSrcweir 
238*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
239*cdf0e10cSrcweir         const Locale &rLocale ) throw(RuntimeException)
240*cdf0e10cSrcweir {
241*cdf0e10cSrcweir         if (nStartPos < 0 || nStartPos > Text.getLength())
242*cdf0e10cSrcweir             return -1;
243*cdf0e10cSrcweir         if (Text.getLength() == 0) return 0;
244*cdf0e10cSrcweir         return LBI->endOfSentence(Text, nStartPos, rLocale);
245*cdf0e10cSrcweir }
246*cdf0e10cSrcweir 
247*cdf0e10cSrcweir LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
248*cdf0e10cSrcweir         const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
249*cdf0e10cSrcweir         const LineBreakUserOptions& bOptions ) throw(RuntimeException)
250*cdf0e10cSrcweir {
251*cdf0e10cSrcweir         return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
252*cdf0e10cSrcweir }
253*cdf0e10cSrcweir 
254*cdf0e10cSrcweir sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
255*cdf0e10cSrcweir         throw(RuntimeException)
256*cdf0e10cSrcweir {
257*cdf0e10cSrcweir         return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
258*cdf0e10cSrcweir                             getScriptClass(Text.iterateCodePoints(&nPos, 0));
259*cdf0e10cSrcweir }
260*cdf0e10cSrcweir 
261*cdf0e10cSrcweir 
262*cdf0e10cSrcweir /** Increments/decrements position first, then obtains character.
263*cdf0e10cSrcweir     @return current position, may be -1 or text length if string was consumed.
264*cdf0e10cSrcweir  */
265*cdf0e10cSrcweir static sal_Int32 SAL_CALL iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
266*cdf0e10cSrcweir         sal_Int32 nLen = Text.getLength();
267*cdf0e10cSrcweir 		if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
268*cdf0e10cSrcweir 			ch = 0;
269*cdf0e10cSrcweir 			nStartPos = nStartPos + inc < 0 ? -1 : nLen;
270*cdf0e10cSrcweir 		} else {
271*cdf0e10cSrcweir 			ch = Text.iterateCodePoints(&nStartPos, inc);
272*cdf0e10cSrcweir             // Fix for #i80436#.
273*cdf0e10cSrcweir             // erAck: 2009-06-30T21:52+0200  This logic looks somewhat
274*cdf0e10cSrcweir             // suspicious as if it cures a symptom.. anyway, had to add
275*cdf0e10cSrcweir             // nStartPos < Text.getLength() to silence the (correct) assertion
276*cdf0e10cSrcweir             // in rtl_uString_iterateCodePoints() if Text was one character
277*cdf0e10cSrcweir             // (codepoint) only, made up of a surrogate pair.
278*cdf0e10cSrcweir             //if (inc > 0 && nStartPos < Text.getLength())
279*cdf0e10cSrcweir             //    ch = Text.iterateCodePoints(&nStartPos, 0);
280*cdf0e10cSrcweir             // With surrogates, nStartPos may actually point behind string
281*cdf0e10cSrcweir             // now, even if inc is only +1
282*cdf0e10cSrcweir 			if (inc > 0)
283*cdf0e10cSrcweir                 ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
284*cdf0e10cSrcweir 		}
285*cdf0e10cSrcweir 		return nStartPos;
286*cdf0e10cSrcweir }
287*cdf0e10cSrcweir 
288*cdf0e10cSrcweir 
289*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
290*cdf0e10cSrcweir         sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
291*cdf0e10cSrcweir {
292*cdf0e10cSrcweir         if (nStartPos < 0 || nStartPos >= Text.getLength())
293*cdf0e10cSrcweir             return -1;
294*cdf0e10cSrcweir 
295*cdf0e10cSrcweir         if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
296*cdf0e10cSrcweir             return -1;
297*cdf0e10cSrcweir 
298*cdf0e10cSrcweir 		if (nStartPos == 0) return 0;
299*cdf0e10cSrcweir 		sal_uInt32 ch=0;
300*cdf0e10cSrcweir         while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
301*cdf0e10cSrcweir 			if (nStartPos == 0) return 0;
302*cdf0e10cSrcweir 		}
303*cdf0e10cSrcweir 
304*cdf0e10cSrcweir         return  iterateCodePoints(Text, nStartPos, 1, ch);
305*cdf0e10cSrcweir }
306*cdf0e10cSrcweir 
307*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
308*cdf0e10cSrcweir         sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
309*cdf0e10cSrcweir {
310*cdf0e10cSrcweir         if (nStartPos < 0 || nStartPos >= Text.getLength())
311*cdf0e10cSrcweir             return -1;
312*cdf0e10cSrcweir 
313*cdf0e10cSrcweir         if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
314*cdf0e10cSrcweir             return -1;
315*cdf0e10cSrcweir 
316*cdf0e10cSrcweir         sal_Int32 strLen = Text.getLength();
317*cdf0e10cSrcweir 		sal_uInt32 ch=0;
318*cdf0e10cSrcweir         while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
319*cdf0e10cSrcweir             sal_Int16 currentCharScriptType = getScriptClass(ch);
320*cdf0e10cSrcweir             if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
321*cdf0e10cSrcweir                 break;
322*cdf0e10cSrcweir         }
323*cdf0e10cSrcweir         return  nStartPos;
324*cdf0e10cSrcweir }
325*cdf0e10cSrcweir 
326*cdf0e10cSrcweir sal_Int32  SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
327*cdf0e10cSrcweir         sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException)
328*cdf0e10cSrcweir {
329*cdf0e10cSrcweir         if (nStartPos < 0)
330*cdf0e10cSrcweir             return -1;
331*cdf0e10cSrcweir         if (nStartPos > Text.getLength())
332*cdf0e10cSrcweir             nStartPos = Text.getLength();
333*cdf0e10cSrcweir 
334*cdf0e10cSrcweir         sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
335*cdf0e10cSrcweir 
336*cdf0e10cSrcweir 		sal_uInt32 ch=0;
337*cdf0e10cSrcweir         while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
338*cdf0e10cSrcweir 			if ((((numberOfChange % 2) == 0) ^ (ScriptType != getScriptClass(ch))))
339*cdf0e10cSrcweir 				numberOfChange--;
340*cdf0e10cSrcweir 			else if (nStartPos == 0) {
341*cdf0e10cSrcweir 				if (numberOfChange > 0)
342*cdf0e10cSrcweir 					numberOfChange--;
343*cdf0e10cSrcweir 				if (nStartPos > 0)
344*cdf0e10cSrcweir 					Text.iterateCodePoints(&nStartPos, -1);
345*cdf0e10cSrcweir 				else
346*cdf0e10cSrcweir 					return -1;
347*cdf0e10cSrcweir 			}
348*cdf0e10cSrcweir         }
349*cdf0e10cSrcweir         return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
350*cdf0e10cSrcweir }
351*cdf0e10cSrcweir 
352*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
353*cdf0e10cSrcweir         sal_Int16 ScriptType ) throw(RuntimeException)
354*cdf0e10cSrcweir 
355*cdf0e10cSrcweir {
356*cdf0e10cSrcweir         if (nStartPos < 0)
357*cdf0e10cSrcweir             nStartPos = 0;
358*cdf0e10cSrcweir         sal_Int32 strLen = Text.getLength();
359*cdf0e10cSrcweir         if (nStartPos > strLen)
360*cdf0e10cSrcweir             return -1;
361*cdf0e10cSrcweir 
362*cdf0e10cSrcweir         sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
363*cdf0e10cSrcweir 
364*cdf0e10cSrcweir 		sal_uInt32 ch=0;
365*cdf0e10cSrcweir         while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
366*cdf0e10cSrcweir 			sal_Int16 currentCharScriptType = getScriptClass(ch);
367*cdf0e10cSrcweir 			if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
368*cdf0e10cSrcweir 					(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
369*cdf0e10cSrcweir 				numberOfChange--;
370*cdf0e10cSrcweir         }
371*cdf0e10cSrcweir         return numberOfChange == 0 ? nStartPos : -1;
372*cdf0e10cSrcweir }
373*cdf0e10cSrcweir 
374*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
375*cdf0e10cSrcweir         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
376*cdf0e10cSrcweir {
377*cdf0e10cSrcweir         if (CharType == CharType::ANY_CHAR) return 0;
378*cdf0e10cSrcweir         if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
379*cdf0e10cSrcweir         if (CharType != (sal_Int16)u_charType( Text.iterateCodePoints(&nStartPos, 0))) return -1;
380*cdf0e10cSrcweir 
381*cdf0e10cSrcweir         sal_Int32 nPos=nStartPos;
382*cdf0e10cSrcweir         while(nStartPos > 0 && CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nPos, -1))) { nStartPos=nPos; }
383*cdf0e10cSrcweir         return nStartPos; // begin of char block is inclusive
384*cdf0e10cSrcweir }
385*cdf0e10cSrcweir 
386*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
387*cdf0e10cSrcweir         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
388*cdf0e10cSrcweir {
389*cdf0e10cSrcweir         sal_Int32 strLen = Text.getLength();
390*cdf0e10cSrcweir 
391*cdf0e10cSrcweir         if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
392*cdf0e10cSrcweir         if (nStartPos < 0 || nStartPos >= strLen) return -1;
393*cdf0e10cSrcweir         if (CharType != (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) return -1;
394*cdf0e10cSrcweir 
395*cdf0e10cSrcweir 		sal_uInt32 ch=0;
396*cdf0e10cSrcweir         while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == (sal_Int16)u_charType(ch)) {}
397*cdf0e10cSrcweir         return nStartPos; // end of char block is exclusive
398*cdf0e10cSrcweir }
399*cdf0e10cSrcweir 
400*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
401*cdf0e10cSrcweir         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
402*cdf0e10cSrcweir {
403*cdf0e10cSrcweir         if (CharType == CharType::ANY_CHAR) return -1;
404*cdf0e10cSrcweir         if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
405*cdf0e10cSrcweir 
406*cdf0e10cSrcweir         sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
407*cdf0e10cSrcweir         sal_Int32 strLen = Text.getLength();
408*cdf0e10cSrcweir 
409*cdf0e10cSrcweir 	sal_uInt32 ch=0;
410*cdf0e10cSrcweir 	while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
411*cdf0e10cSrcweir 		if ((CharType != (sal_Int16)u_charType(ch)) ^ (numberOfChange == 1))
412*cdf0e10cSrcweir 			numberOfChange--;
413*cdf0e10cSrcweir     }
414*cdf0e10cSrcweir     return numberOfChange == 0 ? nStartPos : -1;
415*cdf0e10cSrcweir }
416*cdf0e10cSrcweir 
417*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
418*cdf0e10cSrcweir         const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException)
419*cdf0e10cSrcweir {
420*cdf0e10cSrcweir         if(CharType == CharType::ANY_CHAR) return -1;
421*cdf0e10cSrcweir         if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
422*cdf0e10cSrcweir 
423*cdf0e10cSrcweir         sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
424*cdf0e10cSrcweir 
425*cdf0e10cSrcweir 		sal_uInt32 ch=0;
426*cdf0e10cSrcweir         while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
427*cdf0e10cSrcweir 			if (((numberOfChange % 2) == 0) ^ (CharType != (sal_Int16)u_charType(ch)))
428*cdf0e10cSrcweir 				numberOfChange--;
429*cdf0e10cSrcweir 			if (nStartPos == 0 && numberOfChange > 0) {
430*cdf0e10cSrcweir 				numberOfChange--;
431*cdf0e10cSrcweir 				if (numberOfChange == 0) return nStartPos;
432*cdf0e10cSrcweir 			}
433*cdf0e10cSrcweir         }
434*cdf0e10cSrcweir         return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
435*cdf0e10cSrcweir }
436*cdf0e10cSrcweir 
437*cdf0e10cSrcweir 
438*cdf0e10cSrcweir 
439*cdf0e10cSrcweir sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
440*cdf0e10cSrcweir         sal_Int32 /*nPos*/, const Locale& /*rLocale*/ ) throw(RuntimeException)
441*cdf0e10cSrcweir {
442*cdf0e10cSrcweir         return 0;
443*cdf0e10cSrcweir }
444*cdf0e10cSrcweir 
445*cdf0e10cSrcweir typedef struct {
446*cdf0e10cSrcweir     UBlockCode from;
447*cdf0e10cSrcweir     UBlockCode to;
448*cdf0e10cSrcweir     sal_Int16 script;
449*cdf0e10cSrcweir } UBlock2Script;
450*cdf0e10cSrcweir 
451*cdf0e10cSrcweir // for a list of the UBLOCK_... values see:
452*cdf0e10cSrcweir // http://icu-project.org/apiref/icu4c/uchar_8h.html
453*cdf0e10cSrcweir // where enum UBlockCode is defined.
454*cdf0e10cSrcweir // See also http://www.unicode.org/charts/ for general reference
455*cdf0e10cSrcweir static UBlock2Script scriptList[] = {
456*cdf0e10cSrcweir     {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
457*cdf0e10cSrcweir     {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN},
458*cdf0e10cSrcweir     {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
459*cdf0e10cSrcweir     {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
460*cdf0e10cSrcweir     {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
461*cdf0e10cSrcweir     {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
462*cdf0e10cSrcweir     {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
463*cdf0e10cSrcweir     {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
464*cdf0e10cSrcweir     {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
465*cdf0e10cSrcweir     {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
466*cdf0e10cSrcweir     {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
467*cdf0e10cSrcweir     {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
468*cdf0e10cSrcweir     {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
469*cdf0e10cSrcweir     {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
470*cdf0e10cSrcweir     {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
471*cdf0e10cSrcweir     {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
472*cdf0e10cSrcweir     {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
473*cdf0e10cSrcweir     {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
474*cdf0e10cSrcweir };
475*cdf0e10cSrcweir 
476*cdf0e10cSrcweir #define scriptListCount sizeof (scriptList) / sizeof (UBlock2Script)
477*cdf0e10cSrcweir 
478*cdf0e10cSrcweir sal_Int16  BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
479*cdf0e10cSrcweir {
480*cdf0e10cSrcweir         static sal_uInt32 lastChar = 0;
481*cdf0e10cSrcweir         static sal_Int16 nRet = 0;
482*cdf0e10cSrcweir 
483*cdf0e10cSrcweir         if (currentChar != lastChar) {
484*cdf0e10cSrcweir             lastChar = currentChar;
485*cdf0e10cSrcweir 
486*cdf0e10cSrcweir             //JP 21.9.2001: handle specific characters - always as weak
487*cdf0e10cSrcweir             //                  definition of 1 - this breaks a word
488*cdf0e10cSrcweir             //                  2 - this can be inside a word
489*cdf0e10cSrcweir             //                  0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
490*cdf0e10cSrcweir             if( 1 == currentChar || 2 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
491*cdf0e10cSrcweir                 nRet = ScriptType::WEAK;
492*cdf0e10cSrcweir             // workaround for Coptic
493*cdf0e10cSrcweir             else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
494*cdf0e10cSrcweir                 nRet = ScriptType::LATIN;
495*cdf0e10cSrcweir             // work-around for ligatures (see http://www.unicode.org/charts/PDF/UFB00.pdf)
496*cdf0e10cSrcweir             else if ((0xFB00 <= currentChar && currentChar <= 0xFB06) ||
497*cdf0e10cSrcweir                      (0xFB13 <= currentChar && currentChar <= 0xFB17))
498*cdf0e10cSrcweir                 nRet = ScriptType::LATIN;
499*cdf0e10cSrcweir             else {
500*cdf0e10cSrcweir                 UBlockCode block=ublock_getCode(currentChar);
501*cdf0e10cSrcweir                 sal_uInt16 i;
502*cdf0e10cSrcweir                 for ( i = 0; i < scriptListCount; i++) {
503*cdf0e10cSrcweir                     if (block <= scriptList[i].to) break;
504*cdf0e10cSrcweir                 }
505*cdf0e10cSrcweir                 nRet=(i < scriptListCount && block >= scriptList[i].from) ? scriptList[i].script : ScriptType::WEAK;
506*cdf0e10cSrcweir             }
507*cdf0e10cSrcweir         }
508*cdf0e10cSrcweir         return nRet;
509*cdf0e10cSrcweir }
510*cdf0e10cSrcweir 
511*cdf0e10cSrcweir static inline sal_Bool operator == (const Locale& l1, const Locale& l2) {
512*cdf0e10cSrcweir         return l1.Language == l2.Language && l1.Country == l2.Country && l1.Variant == l2.Variant;
513*cdf0e10cSrcweir }
514*cdf0e10cSrcweir 
515*cdf0e10cSrcweir sal_Bool SAL_CALL BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUString& aLocaleName) throw( RuntimeException )
516*cdf0e10cSrcweir {
517*cdf0e10cSrcweir         // to share service between same Language but different Country code, like zh_CN and zh_TW
518*cdf0e10cSrcweir         for (size_t l = 0; l < lookupTable.size(); l++) {
519*cdf0e10cSrcweir             lookupTableItem *listItem = lookupTable[l];
520*cdf0e10cSrcweir             if (aLocaleName == listItem->aLocale.Language) {
521*cdf0e10cSrcweir                 xBI = listItem->xBI;
522*cdf0e10cSrcweir                 return sal_True;
523*cdf0e10cSrcweir             }
524*cdf0e10cSrcweir         }
525*cdf0e10cSrcweir 
526*cdf0e10cSrcweir         Reference < uno::XInterface > xI = xMSF->createInstance(
527*cdf0e10cSrcweir             OUString::createFromAscii("com.sun.star.i18n.BreakIterator_") + aLocaleName);
528*cdf0e10cSrcweir 
529*cdf0e10cSrcweir         if ( xI.is() ) {
530*cdf0e10cSrcweir             xI->queryInterface( getCppuType((const Reference< XBreakIterator>*)0) ) >>= xBI;
531*cdf0e10cSrcweir             if (xBI.is()) {
532*cdf0e10cSrcweir                 lookupTable.push_back(new lookupTableItem(Locale(aLocaleName, aLocaleName, aLocaleName), xBI));
533*cdf0e10cSrcweir                 return sal_True;
534*cdf0e10cSrcweir             }
535*cdf0e10cSrcweir         }
536*cdf0e10cSrcweir         return sal_False;
537*cdf0e10cSrcweir }
538*cdf0e10cSrcweir 
539*cdf0e10cSrcweir Reference < XBreakIterator > SAL_CALL
540*cdf0e10cSrcweir BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale) throw (RuntimeException)
541*cdf0e10cSrcweir {
542*cdf0e10cSrcweir         if (xBI.is() && rLocale == aLocale)
543*cdf0e10cSrcweir             return xBI;
544*cdf0e10cSrcweir         else if (xMSF.is()) {
545*cdf0e10cSrcweir             aLocale = rLocale;
546*cdf0e10cSrcweir 
547*cdf0e10cSrcweir             for (size_t i = 0; i < lookupTable.size(); i++) {
548*cdf0e10cSrcweir                 lookupTableItem *listItem = lookupTable[i];
549*cdf0e10cSrcweir                 if (rLocale == listItem->aLocale)
550*cdf0e10cSrcweir                     return xBI = listItem->xBI;
551*cdf0e10cSrcweir             }
552*cdf0e10cSrcweir 
553*cdf0e10cSrcweir             sal_Unicode under = (sal_Unicode)'_';
554*cdf0e10cSrcweir 
555*cdf0e10cSrcweir             sal_Int32 l = rLocale.Language.getLength();
556*cdf0e10cSrcweir             sal_Int32 c = rLocale.Country.getLength();
557*cdf0e10cSrcweir             sal_Int32 v = rLocale.Variant.getLength();
558*cdf0e10cSrcweir             OUStringBuffer aBuf(l+c+v+3);
559*cdf0e10cSrcweir 
560*cdf0e10cSrcweir             if ((l > 0 && c > 0 && v > 0 &&
561*cdf0e10cSrcweir                     // load service with name <base>_<lang>_<country>_<varian>
562*cdf0e10cSrcweir                     createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
563*cdf0e10cSrcweir                                     rLocale.Country).append(under).append(rLocale.Variant).makeStringAndClear())) ||
564*cdf0e10cSrcweir                 (l > 0 && c > 0 &&
565*cdf0e10cSrcweir                     // load service with name <base>_<lang>_<country>
566*cdf0e10cSrcweir                     createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
567*cdf0e10cSrcweir                                     rLocale.Country).makeStringAndClear())) ||
568*cdf0e10cSrcweir                 (l > 0 && c > 0 && rLocale.Language.compareToAscii("zh") == 0 &&
569*cdf0e10cSrcweir                                     (rLocale.Country.compareToAscii("HK") == 0 ||
570*cdf0e10cSrcweir                                     rLocale.Country.compareToAscii("MO") == 0) &&
571*cdf0e10cSrcweir                     // if the country code is HK or MO, one more step to try TW.
572*cdf0e10cSrcweir                     createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).appendAscii(
573*cdf0e10cSrcweir                                     "TW").makeStringAndClear())) ||
574*cdf0e10cSrcweir                 (l > 0 &&
575*cdf0e10cSrcweir                     // load service with name <base>_<lang>
576*cdf0e10cSrcweir                     createLocaleSpecificBreakIterator(rLocale.Language)) ||
577*cdf0e10cSrcweir                     // load default service with name <base>_Unicode
578*cdf0e10cSrcweir                     createLocaleSpecificBreakIterator(OUString::createFromAscii("Unicode"))) {
579*cdf0e10cSrcweir                 lookupTable.push_back( new lookupTableItem(aLocale, xBI) );
580*cdf0e10cSrcweir                 return xBI;
581*cdf0e10cSrcweir             }
582*cdf0e10cSrcweir         }
583*cdf0e10cSrcweir         throw RuntimeException();
584*cdf0e10cSrcweir }
585*cdf0e10cSrcweir 
// Name reported by getImplementationName() and getSupportedServiceNames(),
// and the name supportsService() compares against.
586*cdf0e10cSrcweir const sal_Char cBreakIterator[] = "com.sun.star.i18n.BreakIterator";
587*cdf0e10cSrcweir 
588*cdf0e10cSrcweir OUString SAL_CALL
589*cdf0e10cSrcweir BreakIteratorImpl::getImplementationName(void) throw( RuntimeException )
590*cdf0e10cSrcweir {
591*cdf0e10cSrcweir         return OUString::createFromAscii(cBreakIterator);
592*cdf0e10cSrcweir }
593*cdf0e10cSrcweir 
594*cdf0e10cSrcweir sal_Bool SAL_CALL
595*cdf0e10cSrcweir BreakIteratorImpl::supportsService(const OUString& rServiceName) throw( RuntimeException )
596*cdf0e10cSrcweir {
597*cdf0e10cSrcweir         return !rServiceName.compareToAscii(cBreakIterator);
598*cdf0e10cSrcweir }
599*cdf0e10cSrcweir 
600*cdf0e10cSrcweir Sequence< OUString > SAL_CALL
601*cdf0e10cSrcweir BreakIteratorImpl::getSupportedServiceNames(void) throw( RuntimeException )
602*cdf0e10cSrcweir {
603*cdf0e10cSrcweir         Sequence< OUString > aRet(1);
604*cdf0e10cSrcweir         aRet[0] = OUString::createFromAscii(cBreakIterator);
605*cdf0e10cSrcweir         return aRet;
606*cdf0e10cSrcweir }
607*cdf0e10cSrcweir 
608*cdf0e10cSrcweir } } } }
609*cdf0e10cSrcweir 
610