1*cdf0e10cSrcweir /************************************************************************* 2*cdf0e10cSrcweir * 3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4*cdf0e10cSrcweir * 5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6*cdf0e10cSrcweir * 7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8*cdf0e10cSrcweir * 9*cdf0e10cSrcweir * This file is part of OpenOffice.org. 10*cdf0e10cSrcweir * 11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13*cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14*cdf0e10cSrcweir * 15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20*cdf0e10cSrcweir * 21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22*cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24*cdf0e10cSrcweir * for a copy of the LGPLv3 License. 25*cdf0e10cSrcweir * 26*cdf0e10cSrcweir ************************************************************************/ 27*cdf0e10cSrcweir 28*cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove 29*cdf0e10cSrcweir #include "precompiled_i18npool.hxx" 30*cdf0e10cSrcweir 31*cdf0e10cSrcweir #include <breakiteratorImpl.hxx> 32*cdf0e10cSrcweir #include <unicode/uchar.h> 33*cdf0e10cSrcweir #include <rtl/ustrbuf.hxx> 34*cdf0e10cSrcweir 35*cdf0e10cSrcweir using namespace ::com::sun::star::uno; 36*cdf0e10cSrcweir using namespace ::com::sun::star::lang; 37*cdf0e10cSrcweir using namespace ::rtl; 38*cdf0e10cSrcweir 39*cdf0e10cSrcweir namespace com { namespace sun { namespace star { namespace i18n { 40*cdf0e10cSrcweir 41*cdf0e10cSrcweir BreakIteratorImpl::BreakIteratorImpl( const Reference < XMultiServiceFactory >& rxMSF ) : xMSF( rxMSF ) 42*cdf0e10cSrcweir { 43*cdf0e10cSrcweir } 44*cdf0e10cSrcweir 45*cdf0e10cSrcweir BreakIteratorImpl::BreakIteratorImpl() 46*cdf0e10cSrcweir { 47*cdf0e10cSrcweir } 48*cdf0e10cSrcweir 49*cdf0e10cSrcweir BreakIteratorImpl::~BreakIteratorImpl() 50*cdf0e10cSrcweir { 51*cdf0e10cSrcweir // Clear lookuptable 52*cdf0e10cSrcweir for (size_t l = 0; l < lookupTable.size(); l++) 53*cdf0e10cSrcweir delete lookupTable[l]; 54*cdf0e10cSrcweir lookupTable.clear(); 55*cdf0e10cSrcweir } 56*cdf0e10cSrcweir 57*cdf0e10cSrcweir #define LBI getLocaleSpecificBreakIterator(rLocale) 58*cdf0e10cSrcweir 59*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos, 60*cdf0e10cSrcweir const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone ) 61*cdf0e10cSrcweir throw(RuntimeException) 62*cdf0e10cSrcweir { 63*cdf0e10cSrcweir if (nCount < 0) throw RuntimeException(); 64*cdf0e10cSrcweir 65*cdf0e10cSrcweir return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone); 66*cdf0e10cSrcweir } 67*cdf0e10cSrcweir 68*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos, 69*cdf0e10cSrcweir const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone ) 70*cdf0e10cSrcweir throw(RuntimeException) 71*cdf0e10cSrcweir { 72*cdf0e10cSrcweir if (nCount < 0) throw RuntimeException(); 73*cdf0e10cSrcweir 74*cdf0e10cSrcweir return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone); 75*cdf0e10cSrcweir } 76*cdf0e10cSrcweir 77*cdf0e10cSrcweir #define isZWSP(c) (ch == 0x200B) 78*cdf0e10cSrcweir 79*cdf0e10cSrcweir static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, sal_Bool bDirection) 80*cdf0e10cSrcweir { 81*cdf0e10cSrcweir sal_uInt32 ch=0; 82*cdf0e10cSrcweir sal_Int32 pos=nPos; 83*cdf0e10cSrcweir switch (rWordType) { 84*cdf0e10cSrcweir case WordType::ANYWORD_IGNOREWHITESPACES: 85*cdf0e10cSrcweir if (bDirection) 86*cdf0e10cSrcweir while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos; 87*cdf0e10cSrcweir else 88*cdf0e10cSrcweir while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos; 89*cdf0e10cSrcweir break; 90*cdf0e10cSrcweir case WordType::DICTIONARY_WORD: 91*cdf0e10cSrcweir if (bDirection) 92*cdf0e10cSrcweir while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch) || 93*cdf0e10cSrcweir ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos; 94*cdf0e10cSrcweir else 95*cdf0e10cSrcweir while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch) || 96*cdf0e10cSrcweir ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos; 97*cdf0e10cSrcweir break; 98*cdf0e10cSrcweir case WordType::WORD_COUNT: 99*cdf0e10cSrcweir if (bDirection) 100*cdf0e10cSrcweir while (nPos < len && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos; 101*cdf0e10cSrcweir else 102*cdf0e10cSrcweir while (nPos > 0 && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos; 103*cdf0e10cSrcweir break; 104*cdf0e10cSrcweir } 105*cdf0e10cSrcweir return nPos; 106*cdf0e10cSrcweir } 107*cdf0e10cSrcweir 108*cdf0e10cSrcweir Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos, 109*cdf0e10cSrcweir const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException) 110*cdf0e10cSrcweir { 111*cdf0e10cSrcweir sal_Int32 len = Text.getLength(); 112*cdf0e10cSrcweir if( nStartPos < 0 || len == 0 ) 113*cdf0e10cSrcweir result.endPos = result.startPos = 0; 114*cdf0e10cSrcweir else if (nStartPos >= len) 115*cdf0e10cSrcweir result.endPos = result.startPos = len; 116*cdf0e10cSrcweir else { 117*cdf0e10cSrcweir result = LBI->nextWord(Text, nStartPos, rLocale, rWordType); 118*cdf0e10cSrcweir 119*cdf0e10cSrcweir nStartPos = skipSpace(Text, result.startPos, len, rWordType, sal_True); 120*cdf0e10cSrcweir 121*cdf0e10cSrcweir if ( nStartPos != result.startPos) { 122*cdf0e10cSrcweir if( nStartPos >= len ) 123*cdf0e10cSrcweir result.startPos = result.endPos = len; 124*cdf0e10cSrcweir else { 125*cdf0e10cSrcweir result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, sal_True); 126*cdf0e10cSrcweir // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts 127*cdf0e10cSrcweir if (result.startPos < nStartPos) result.startPos = nStartPos; 128*cdf0e10cSrcweir } 129*cdf0e10cSrcweir } 130*cdf0e10cSrcweir } 131*cdf0e10cSrcweir return result; 132*cdf0e10cSrcweir } 133*cdf0e10cSrcweir 134*cdf0e10cSrcweir static inline sal_Bool SAL_CALL isCJK( const Locale& rLocale ) { 135*cdf0e10cSrcweir return rLocale.Language.equalsAscii("zh") || rLocale.Language.equalsAscii("ja") || rLocale.Language.equalsAscii("ko"); 136*cdf0e10cSrcweir } 137*cdf0e10cSrcweir 138*cdf0e10cSrcweir Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos, 139*cdf0e10cSrcweir const Locale& rLocale, sal_Int16 rWordType) throw(RuntimeException) 140*cdf0e10cSrcweir { 141*cdf0e10cSrcweir sal_Int32 len = Text.getLength(); 142*cdf0e10cSrcweir if( nStartPos <= 0 || len == 0 ) { 143*cdf0e10cSrcweir result.endPos = result.startPos = 0; 144*cdf0e10cSrcweir return result; 145*cdf0e10cSrcweir } else if (nStartPos > len) { 146*cdf0e10cSrcweir result.endPos = result.startPos = len; 147*cdf0e10cSrcweir return result; 148*cdf0e10cSrcweir } 149*cdf0e10cSrcweir 150*cdf0e10cSrcweir sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, sal_False); 151*cdf0e10cSrcweir 152*cdf0e10cSrcweir // if some spaces are skiped, and the script type is Asian with no CJK rLocale, we have to return 153*cdf0e10cSrcweir // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary. 154*cdf0e10cSrcweir result.startPos = nPos; 155*cdf0e10cSrcweir if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) { 156*cdf0e10cSrcweir result.endPos = -1; 157*cdf0e10cSrcweir return result; 158*cdf0e10cSrcweir } 159*cdf0e10cSrcweir 160*cdf0e10cSrcweir return LBI->previousWord(Text, result.startPos, rLocale, rWordType); 161*cdf0e10cSrcweir } 162*cdf0e10cSrcweir 163*cdf0e10cSrcweir 164*cdf0e10cSrcweir Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale, 165*cdf0e10cSrcweir sal_Int16 rWordType, sal_Bool bDirection ) throw(RuntimeException) 166*cdf0e10cSrcweir { 167*cdf0e10cSrcweir sal_Int32 len = Text.getLength(); 168*cdf0e10cSrcweir if( nPos < 0 || len == 0 ) 169*cdf0e10cSrcweir result.endPos = result.startPos = 0; 170*cdf0e10cSrcweir else if (nPos > len) 171*cdf0e10cSrcweir result.endPos = result.startPos = len; 172*cdf0e10cSrcweir else { 173*cdf0e10cSrcweir sal_Int32 next, prev; 174*cdf0e10cSrcweir next = skipSpace(Text, nPos, len, rWordType, sal_True); 175*cdf0e10cSrcweir prev = skipSpace(Text, nPos, len, rWordType, sal_False); 176*cdf0e10cSrcweir if (prev == 0 && next == len) { 177*cdf0e10cSrcweir result.endPos = result.startPos = nPos; 178*cdf0e10cSrcweir } else if (prev == 0 && ! bDirection) { 179*cdf0e10cSrcweir result.endPos = result.startPos = 0; 180*cdf0e10cSrcweir } else if (next == len && bDirection) { 181*cdf0e10cSrcweir result.endPos = result.startPos = len; 182*cdf0e10cSrcweir } else { 183*cdf0e10cSrcweir if (next != prev) { 184*cdf0e10cSrcweir if (next == nPos && next != len) 185*cdf0e10cSrcweir bDirection = sal_True; 186*cdf0e10cSrcweir else if (prev == nPos && prev != 0) 187*cdf0e10cSrcweir bDirection = sal_False; 188*cdf0e10cSrcweir else 189*cdf0e10cSrcweir nPos = bDirection ? next : prev; 190*cdf0e10cSrcweir } 191*cdf0e10cSrcweir result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection); 192*cdf0e10cSrcweir } 193*cdf0e10cSrcweir } 194*cdf0e10cSrcweir return result; 195*cdf0e10cSrcweir } 196*cdf0e10cSrcweir 197*cdf0e10cSrcweir sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos, 198*cdf0e10cSrcweir const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException) 199*cdf0e10cSrcweir { 200*cdf0e10cSrcweir sal_Int32 len = Text.getLength(); 201*cdf0e10cSrcweir 202*cdf0e10cSrcweir if (nPos < 0 || nPos >= len) return sal_False; 203*cdf0e10cSrcweir 204*cdf0e10cSrcweir sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_True); 205*cdf0e10cSrcweir 206*cdf0e10cSrcweir if (tmp != nPos) return sal_False; 207*cdf0e10cSrcweir 208*cdf0e10cSrcweir result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_True); 209*cdf0e10cSrcweir 210*cdf0e10cSrcweir return result.startPos == nPos; 211*cdf0e10cSrcweir } 212*cdf0e10cSrcweir 213*cdf0e10cSrcweir sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos, 214*cdf0e10cSrcweir const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException) 215*cdf0e10cSrcweir { 216*cdf0e10cSrcweir sal_Int32 len = Text.getLength(); 217*cdf0e10cSrcweir 218*cdf0e10cSrcweir if (nPos <= 0 || nPos > len) return sal_False; 219*cdf0e10cSrcweir 220*cdf0e10cSrcweir sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_False); 221*cdf0e10cSrcweir 222*cdf0e10cSrcweir if (tmp != nPos) return sal_False; 223*cdf0e10cSrcweir 224*cdf0e10cSrcweir result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_False); 225*cdf0e10cSrcweir 226*cdf0e10cSrcweir return result.endPos == nPos; 227*cdf0e10cSrcweir } 228*cdf0e10cSrcweir 229*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos, 230*cdf0e10cSrcweir const Locale &rLocale ) throw(RuntimeException) 231*cdf0e10cSrcweir { 232*cdf0e10cSrcweir if (nStartPos < 0 || nStartPos > Text.getLength()) 233*cdf0e10cSrcweir return -1; 234*cdf0e10cSrcweir if (Text.getLength() == 0) return 0; 235*cdf0e10cSrcweir return LBI->beginOfSentence(Text, nStartPos, rLocale); 236*cdf0e10cSrcweir } 237*cdf0e10cSrcweir 238*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos, 239*cdf0e10cSrcweir const Locale &rLocale ) throw(RuntimeException) 240*cdf0e10cSrcweir { 241*cdf0e10cSrcweir if (nStartPos < 0 || nStartPos > Text.getLength()) 242*cdf0e10cSrcweir return -1; 243*cdf0e10cSrcweir if (Text.getLength() == 0) return 0; 244*cdf0e10cSrcweir return LBI->endOfSentence(Text, nStartPos, rLocale); 245*cdf0e10cSrcweir } 246*cdf0e10cSrcweir 247*cdf0e10cSrcweir LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos, 248*cdf0e10cSrcweir const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions, 249*cdf0e10cSrcweir const LineBreakUserOptions& bOptions ) throw(RuntimeException) 250*cdf0e10cSrcweir { 251*cdf0e10cSrcweir return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions); 252*cdf0e10cSrcweir } 253*cdf0e10cSrcweir 254*cdf0e10cSrcweir sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos ) 255*cdf0e10cSrcweir throw(RuntimeException) 256*cdf0e10cSrcweir { 257*cdf0e10cSrcweir return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK : 258*cdf0e10cSrcweir getScriptClass(Text.iterateCodePoints(&nPos, 0)); 259*cdf0e10cSrcweir } 260*cdf0e10cSrcweir 261*cdf0e10cSrcweir 262*cdf0e10cSrcweir /** Increments/decrements position first, then obtains character. 263*cdf0e10cSrcweir @return current position, may be -1 or text length if string was consumed. 264*cdf0e10cSrcweir */ 265*cdf0e10cSrcweir static sal_Int32 SAL_CALL iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) { 266*cdf0e10cSrcweir sal_Int32 nLen = Text.getLength(); 267*cdf0e10cSrcweir if (nStartPos + inc < 0 || nStartPos + inc >= nLen) { 268*cdf0e10cSrcweir ch = 0; 269*cdf0e10cSrcweir nStartPos = nStartPos + inc < 0 ? -1 : nLen; 270*cdf0e10cSrcweir } else { 271*cdf0e10cSrcweir ch = Text.iterateCodePoints(&nStartPos, inc); 272*cdf0e10cSrcweir // Fix for #i80436#. 273*cdf0e10cSrcweir // erAck: 2009-06-30T21:52+0200 This logic looks somewhat 274*cdf0e10cSrcweir // suspicious as if it cures a symptom.. anyway, had to add 275*cdf0e10cSrcweir // nStartPos < Text.getLength() to silence the (correct) assertion 276*cdf0e10cSrcweir // in rtl_uString_iterateCodePoints() if Text was one character 277*cdf0e10cSrcweir // (codepoint) only, made up of a surrogate pair. 278*cdf0e10cSrcweir //if (inc > 0 && nStartPos < Text.getLength()) 279*cdf0e10cSrcweir // ch = Text.iterateCodePoints(&nStartPos, 0); 280*cdf0e10cSrcweir // With surrogates, nStartPos may actually point behind string 281*cdf0e10cSrcweir // now, even if inc is only +1 282*cdf0e10cSrcweir if (inc > 0) 283*cdf0e10cSrcweir ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0); 284*cdf0e10cSrcweir } 285*cdf0e10cSrcweir return nStartPos; 286*cdf0e10cSrcweir } 287*cdf0e10cSrcweir 288*cdf0e10cSrcweir 289*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text, 290*cdf0e10cSrcweir sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException) 291*cdf0e10cSrcweir { 292*cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength()) 293*cdf0e10cSrcweir return -1; 294*cdf0e10cSrcweir 295*cdf0e10cSrcweir if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) 296*cdf0e10cSrcweir return -1; 297*cdf0e10cSrcweir 298*cdf0e10cSrcweir if (nStartPos == 0) return 0; 299*cdf0e10cSrcweir sal_uInt32 ch=0; 300*cdf0e10cSrcweir while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) { 301*cdf0e10cSrcweir if (nStartPos == 0) return 0; 302*cdf0e10cSrcweir } 303*cdf0e10cSrcweir 304*cdf0e10cSrcweir return iterateCodePoints(Text, nStartPos, 1, ch); 305*cdf0e10cSrcweir } 306*cdf0e10cSrcweir 307*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text, 308*cdf0e10cSrcweir sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException) 309*cdf0e10cSrcweir { 310*cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength()) 311*cdf0e10cSrcweir return -1; 312*cdf0e10cSrcweir 313*cdf0e10cSrcweir if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) 314*cdf0e10cSrcweir return -1; 315*cdf0e10cSrcweir 316*cdf0e10cSrcweir sal_Int32 strLen = Text.getLength(); 317*cdf0e10cSrcweir sal_uInt32 ch=0; 318*cdf0e10cSrcweir while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) { 319*cdf0e10cSrcweir sal_Int16 currentCharScriptType = getScriptClass(ch); 320*cdf0e10cSrcweir if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK) 321*cdf0e10cSrcweir break; 322*cdf0e10cSrcweir } 323*cdf0e10cSrcweir return nStartPos; 324*cdf0e10cSrcweir } 325*cdf0e10cSrcweir 326*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text, 327*cdf0e10cSrcweir sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException) 328*cdf0e10cSrcweir { 329*cdf0e10cSrcweir if (nStartPos < 0) 330*cdf0e10cSrcweir return -1; 331*cdf0e10cSrcweir if (nStartPos > Text.getLength()) 332*cdf0e10cSrcweir nStartPos = Text.getLength(); 333*cdf0e10cSrcweir 334*cdf0e10cSrcweir sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2; 335*cdf0e10cSrcweir 336*cdf0e10cSrcweir sal_uInt32 ch=0; 337*cdf0e10cSrcweir while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) { 338*cdf0e10cSrcweir if ((((numberOfChange % 2) == 0) ^ (ScriptType != getScriptClass(ch)))) 339*cdf0e10cSrcweir numberOfChange--; 340*cdf0e10cSrcweir else if (nStartPos == 0) { 341*cdf0e10cSrcweir if (numberOfChange > 0) 342*cdf0e10cSrcweir numberOfChange--; 343*cdf0e10cSrcweir if (nStartPos > 0) 344*cdf0e10cSrcweir Text.iterateCodePoints(&nStartPos, -1); 345*cdf0e10cSrcweir else 346*cdf0e10cSrcweir return -1; 347*cdf0e10cSrcweir } 348*cdf0e10cSrcweir } 349*cdf0e10cSrcweir return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1; 350*cdf0e10cSrcweir } 351*cdf0e10cSrcweir 352*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos, 353*cdf0e10cSrcweir sal_Int16 ScriptType ) throw(RuntimeException) 354*cdf0e10cSrcweir 355*cdf0e10cSrcweir { 356*cdf0e10cSrcweir if (nStartPos < 0) 357*cdf0e10cSrcweir nStartPos = 0; 358*cdf0e10cSrcweir sal_Int32 strLen = Text.getLength(); 359*cdf0e10cSrcweir if (nStartPos > strLen) 360*cdf0e10cSrcweir return -1; 361*cdf0e10cSrcweir 362*cdf0e10cSrcweir sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1; 363*cdf0e10cSrcweir 364*cdf0e10cSrcweir sal_uInt32 ch=0; 365*cdf0e10cSrcweir while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) { 366*cdf0e10cSrcweir sal_Int16 currentCharScriptType = getScriptClass(ch); 367*cdf0e10cSrcweir if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) : 368*cdf0e10cSrcweir (ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)) 369*cdf0e10cSrcweir numberOfChange--; 370*cdf0e10cSrcweir } 371*cdf0e10cSrcweir return numberOfChange == 0 ? nStartPos : -1; 372*cdf0e10cSrcweir } 373*cdf0e10cSrcweir 374*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos, 375*cdf0e10cSrcweir const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException) 376*cdf0e10cSrcweir { 377*cdf0e10cSrcweir if (CharType == CharType::ANY_CHAR) return 0; 378*cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1; 379*cdf0e10cSrcweir if (CharType != (sal_Int16)u_charType( Text.iterateCodePoints(&nStartPos, 0))) return -1; 380*cdf0e10cSrcweir 381*cdf0e10cSrcweir sal_Int32 nPos=nStartPos; 382*cdf0e10cSrcweir while(nStartPos > 0 && CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nPos, -1))) { nStartPos=nPos; } 383*cdf0e10cSrcweir return nStartPos; // begin of char block is inclusive 384*cdf0e10cSrcweir } 385*cdf0e10cSrcweir 386*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos, 387*cdf0e10cSrcweir const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException) 388*cdf0e10cSrcweir { 389*cdf0e10cSrcweir sal_Int32 strLen = Text.getLength(); 390*cdf0e10cSrcweir 391*cdf0e10cSrcweir if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive 392*cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= strLen) return -1; 393*cdf0e10cSrcweir if (CharType != (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) return -1; 394*cdf0e10cSrcweir 395*cdf0e10cSrcweir sal_uInt32 ch=0; 396*cdf0e10cSrcweir while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == (sal_Int16)u_charType(ch)) {} 397*cdf0e10cSrcweir return nStartPos; // end of char block is exclusive 398*cdf0e10cSrcweir } 399*cdf0e10cSrcweir 400*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos, 401*cdf0e10cSrcweir const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException) 402*cdf0e10cSrcweir { 403*cdf0e10cSrcweir if (CharType == CharType::ANY_CHAR) return -1; 404*cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1; 405*cdf0e10cSrcweir 406*cdf0e10cSrcweir sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1; 407*cdf0e10cSrcweir sal_Int32 strLen = Text.getLength(); 408*cdf0e10cSrcweir 409*cdf0e10cSrcweir sal_uInt32 ch=0; 410*cdf0e10cSrcweir while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) { 411*cdf0e10cSrcweir if ((CharType != (sal_Int16)u_charType(ch)) ^ (numberOfChange == 1)) 412*cdf0e10cSrcweir numberOfChange--; 413*cdf0e10cSrcweir } 414*cdf0e10cSrcweir return numberOfChange == 0 ? nStartPos : -1; 415*cdf0e10cSrcweir } 416*cdf0e10cSrcweir 417*cdf0e10cSrcweir sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos, 418*cdf0e10cSrcweir const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException) 419*cdf0e10cSrcweir { 420*cdf0e10cSrcweir if(CharType == CharType::ANY_CHAR) return -1; 421*cdf0e10cSrcweir if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1; 422*cdf0e10cSrcweir 423*cdf0e10cSrcweir sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2; 424*cdf0e10cSrcweir 425*cdf0e10cSrcweir sal_uInt32 ch=0; 426*cdf0e10cSrcweir while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) { 427*cdf0e10cSrcweir if (((numberOfChange % 2) == 0) ^ (CharType != (sal_Int16)u_charType(ch))) 428*cdf0e10cSrcweir numberOfChange--; 429*cdf0e10cSrcweir if (nStartPos == 0 && numberOfChange > 0) { 430*cdf0e10cSrcweir numberOfChange--; 431*cdf0e10cSrcweir if (numberOfChange == 0) return nStartPos; 432*cdf0e10cSrcweir } 433*cdf0e10cSrcweir } 434*cdf0e10cSrcweir return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1; 435*cdf0e10cSrcweir } 436*cdf0e10cSrcweir 437*cdf0e10cSrcweir 438*cdf0e10cSrcweir 439*cdf0e10cSrcweir sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/, 440*cdf0e10cSrcweir sal_Int32 /*nPos*/, const Locale& /*rLocale*/ ) throw(RuntimeException) 441*cdf0e10cSrcweir { 442*cdf0e10cSrcweir return 0; 443*cdf0e10cSrcweir } 444*cdf0e10cSrcweir 445*cdf0e10cSrcweir typedef struct { 446*cdf0e10cSrcweir UBlockCode from; 447*cdf0e10cSrcweir UBlockCode to; 448*cdf0e10cSrcweir sal_Int16 script; 449*cdf0e10cSrcweir } UBlock2Script; 450*cdf0e10cSrcweir 451*cdf0e10cSrcweir // for a list of the UBLOCK_... values see: 452*cdf0e10cSrcweir // http://icu-project.org/apiref/icu4c/uchar_8h.html 453*cdf0e10cSrcweir // where enum UBlockCode is defined. 454*cdf0e10cSrcweir // See also http://www.unicode.org/charts/ for general reference 455*cdf0e10cSrcweir static UBlock2Script scriptList[] = { 456*cdf0e10cSrcweir {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK}, 457*cdf0e10cSrcweir {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN}, 458*cdf0e10cSrcweir {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX}, 459*cdf0e10cSrcweir {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN}, 460*cdf0e10cSrcweir {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN}, 461*cdf0e10cSrcweir {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX}, 462*cdf0e10cSrcweir {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN}, 463*cdf0e10cSrcweir {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX}, 464*cdf0e10cSrcweir {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN}, 465*cdf0e10cSrcweir {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN}, 466*cdf0e10cSrcweir {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN}, 467*cdf0e10cSrcweir {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX}, 468*cdf0e10cSrcweir {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN}, 469*cdf0e10cSrcweir {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX}, 470*cdf0e10cSrcweir {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN}, 471*cdf0e10cSrcweir {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN}, 472*cdf0e10cSrcweir {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN}, 473*cdf0e10cSrcweir {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN} 474*cdf0e10cSrcweir }; 475*cdf0e10cSrcweir 476*cdf0e10cSrcweir #define scriptListCount sizeof (scriptList) / sizeof (UBlock2Script) 477*cdf0e10cSrcweir 478*cdf0e10cSrcweir sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar) 479*cdf0e10cSrcweir { 480*cdf0e10cSrcweir static sal_uInt32 lastChar = 0; 481*cdf0e10cSrcweir static sal_Int16 nRet = 0; 482*cdf0e10cSrcweir 483*cdf0e10cSrcweir if (currentChar != lastChar) { 484*cdf0e10cSrcweir lastChar = currentChar; 485*cdf0e10cSrcweir 486*cdf0e10cSrcweir //JP 21.9.2001: handle specific characters - always as weak 487*cdf0e10cSrcweir // definition of 1 - this breaks a word 488*cdf0e10cSrcweir // 2 - this can be inside a word 489*cdf0e10cSrcweir // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char. 490*cdf0e10cSrcweir if( 1 == currentChar || 2 == currentChar || 0x20 == currentChar || 0xA0 == currentChar) 491*cdf0e10cSrcweir nRet = ScriptType::WEAK; 492*cdf0e10cSrcweir // workaround for Coptic 493*cdf0e10cSrcweir else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar) 494*cdf0e10cSrcweir nRet = ScriptType::LATIN; 495*cdf0e10cSrcweir // work-around for ligatures (see http://www.unicode.org/charts/PDF/UFB00.pdf) 496*cdf0e10cSrcweir else if ((0xFB00 <= currentChar && currentChar <= 0xFB06) || 497*cdf0e10cSrcweir (0xFB13 <= currentChar && currentChar <= 0xFB17)) 498*cdf0e10cSrcweir nRet = ScriptType::LATIN; 499*cdf0e10cSrcweir else { 500*cdf0e10cSrcweir UBlockCode block=ublock_getCode(currentChar); 501*cdf0e10cSrcweir sal_uInt16 i; 502*cdf0e10cSrcweir for ( i = 0; i < scriptListCount; i++) { 503*cdf0e10cSrcweir if (block <= scriptList[i].to) break; 504*cdf0e10cSrcweir } 505*cdf0e10cSrcweir nRet=(i < scriptListCount && block >= scriptList[i].from) ? scriptList[i].script : ScriptType::WEAK; 506*cdf0e10cSrcweir } 507*cdf0e10cSrcweir } 508*cdf0e10cSrcweir return nRet; 509*cdf0e10cSrcweir } 510*cdf0e10cSrcweir 511*cdf0e10cSrcweir static inline sal_Bool operator == (const Locale& l1, const Locale& l2) { 512*cdf0e10cSrcweir return l1.Language == l2.Language && l1.Country == l2.Country && l1.Variant == l2.Variant; 513*cdf0e10cSrcweir } 514*cdf0e10cSrcweir 515*cdf0e10cSrcweir sal_Bool SAL_CALL BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUString& aLocaleName) throw( RuntimeException ) 516*cdf0e10cSrcweir { 517*cdf0e10cSrcweir // to share service between same Language but different Country code, like zh_CN and zh_TW 518*cdf0e10cSrcweir for (size_t l = 0; l < lookupTable.size(); l++) { 519*cdf0e10cSrcweir lookupTableItem *listItem = lookupTable[l]; 520*cdf0e10cSrcweir if (aLocaleName == listItem->aLocale.Language) { 521*cdf0e10cSrcweir xBI = listItem->xBI; 522*cdf0e10cSrcweir return sal_True; 523*cdf0e10cSrcweir } 524*cdf0e10cSrcweir } 525*cdf0e10cSrcweir 526*cdf0e10cSrcweir Reference < uno::XInterface > xI = xMSF->createInstance( 527*cdf0e10cSrcweir OUString::createFromAscii("com.sun.star.i18n.BreakIterator_") + aLocaleName); 528*cdf0e10cSrcweir 529*cdf0e10cSrcweir if ( xI.is() ) { 530*cdf0e10cSrcweir xI->queryInterface( getCppuType((const Reference< XBreakIterator>*)0) ) >>= xBI; 531*cdf0e10cSrcweir if (xBI.is()) { 532*cdf0e10cSrcweir lookupTable.push_back(new lookupTableItem(Locale(aLocaleName, aLocaleName, aLocaleName), xBI)); 533*cdf0e10cSrcweir return sal_True; 534*cdf0e10cSrcweir } 535*cdf0e10cSrcweir } 536*cdf0e10cSrcweir return sal_False; 537*cdf0e10cSrcweir } 538*cdf0e10cSrcweir 539*cdf0e10cSrcweir Reference < XBreakIterator > SAL_CALL 540*cdf0e10cSrcweir BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale) throw (RuntimeException) 541*cdf0e10cSrcweir { 542*cdf0e10cSrcweir if (xBI.is() && rLocale == aLocale) 543*cdf0e10cSrcweir return xBI; 544*cdf0e10cSrcweir else if (xMSF.is()) { 545*cdf0e10cSrcweir aLocale = rLocale; 546*cdf0e10cSrcweir 547*cdf0e10cSrcweir for (size_t i = 0; i < lookupTable.size(); i++) { 548*cdf0e10cSrcweir lookupTableItem *listItem = lookupTable[i]; 549*cdf0e10cSrcweir if (rLocale == listItem->aLocale) 550*cdf0e10cSrcweir return xBI = listItem->xBI; 551*cdf0e10cSrcweir } 552*cdf0e10cSrcweir 553*cdf0e10cSrcweir sal_Unicode under = (sal_Unicode)'_'; 554*cdf0e10cSrcweir 555*cdf0e10cSrcweir sal_Int32 l = rLocale.Language.getLength(); 556*cdf0e10cSrcweir sal_Int32 c = rLocale.Country.getLength(); 557*cdf0e10cSrcweir sal_Int32 v = rLocale.Variant.getLength(); 558*cdf0e10cSrcweir OUStringBuffer aBuf(l+c+v+3); 559*cdf0e10cSrcweir 560*cdf0e10cSrcweir if ((l > 0 && c > 0 && v > 0 && 561*cdf0e10cSrcweir // load service with name <base>_<lang>_<country>_<varian> 562*cdf0e10cSrcweir createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append( 563*cdf0e10cSrcweir rLocale.Country).append(under).append(rLocale.Variant).makeStringAndClear())) || 564*cdf0e10cSrcweir (l > 0 && c > 0 && 565*cdf0e10cSrcweir // load service with name <base>_<lang>_<country> 566*cdf0e10cSrcweir createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append( 567*cdf0e10cSrcweir rLocale.Country).makeStringAndClear())) || 568*cdf0e10cSrcweir (l > 0 && c > 0 && rLocale.Language.compareToAscii("zh") == 0 && 569*cdf0e10cSrcweir (rLocale.Country.compareToAscii("HK") == 0 || 570*cdf0e10cSrcweir rLocale.Country.compareToAscii("MO") == 0) && 571*cdf0e10cSrcweir // if the country code is HK or MO, one more step to try TW. 572*cdf0e10cSrcweir createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).appendAscii( 573*cdf0e10cSrcweir "TW").makeStringAndClear())) || 574*cdf0e10cSrcweir (l > 0 && 575*cdf0e10cSrcweir // load service with name <base>_<lang> 576*cdf0e10cSrcweir createLocaleSpecificBreakIterator(rLocale.Language)) || 577*cdf0e10cSrcweir // load default service with name <base>_Unicode 578*cdf0e10cSrcweir createLocaleSpecificBreakIterator(OUString::createFromAscii("Unicode"))) { 579*cdf0e10cSrcweir lookupTable.push_back( new lookupTableItem(aLocale, xBI) ); 580*cdf0e10cSrcweir return xBI; 581*cdf0e10cSrcweir } 582*cdf0e10cSrcweir } 583*cdf0e10cSrcweir throw RuntimeException(); 584*cdf0e10cSrcweir } 585*cdf0e10cSrcweir 586*cdf0e10cSrcweir const sal_Char cBreakIterator[] = "com.sun.star.i18n.BreakIterator"; 587*cdf0e10cSrcweir 588*cdf0e10cSrcweir OUString SAL_CALL 589*cdf0e10cSrcweir BreakIteratorImpl::getImplementationName(void) throw( RuntimeException ) 590*cdf0e10cSrcweir { 591*cdf0e10cSrcweir return OUString::createFromAscii(cBreakIterator); 592*cdf0e10cSrcweir } 593*cdf0e10cSrcweir 594*cdf0e10cSrcweir sal_Bool SAL_CALL 595*cdf0e10cSrcweir BreakIteratorImpl::supportsService(const OUString& rServiceName) throw( RuntimeException ) 596*cdf0e10cSrcweir { 597*cdf0e10cSrcweir return !rServiceName.compareToAscii(cBreakIterator); 598*cdf0e10cSrcweir } 599*cdf0e10cSrcweir 600*cdf0e10cSrcweir Sequence< OUString > SAL_CALL 601*cdf0e10cSrcweir BreakIteratorImpl::getSupportedServiceNames(void) throw( RuntimeException ) 602*cdf0e10cSrcweir { 603*cdf0e10cSrcweir Sequence< OUString > aRet(1); 604*cdf0e10cSrcweir aRet[0] = OUString::createFromAscii(cBreakIterator); 605*cdf0e10cSrcweir return aRet; 606*cdf0e10cSrcweir } 607*cdf0e10cSrcweir 608*cdf0e10cSrcweir } } } } 609*cdf0e10cSrcweir 610