1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_i18npool.hxx" 26 27 // xdictionary.cpp: implementation of the xdictionary class. 28 // 29 ////////////////////////////////////////////////////////////////////// 30 31 32 #include <rtl/ustrbuf.hxx> 33 34 #include <com/sun/star/i18n/WordType.hpp> 35 #include <xdictionary.hxx> 36 #include <unicode/uchar.h> 37 #include <string.h> 38 #include <breakiteratorImpl.hxx> 39 40 ////////////////////////////////////////////////////////////////////// 41 // Construction/Destruction 42 ////////////////////////////////////////////////////////////////////// 43 44 using namespace rtl; 45 46 namespace com { namespace sun { namespace star { namespace i18n { 47 48 extern "C" { static void SAL_CALL thisModule() {} } 49 50 xdictionary::xdictionary(const sal_Char *lang) : 51 existMark( NULL ), 52 index1( NULL ), 53 index2( NULL ), 54 lenArray( NULL ), 55 dataArea( NULL ), 56 hModule( NULL ), 57 boundary(), 58 japaneseWordBreak( sal_False ) 59 #if USE_CELL_BOUNDARY_CODE 60 // For CTL breakiterator, where the word boundary should not be inside cell. 61 , 62 useCellBoundary( sal_False ), 63 cellBoundary( NULL ) 64 #endif 65 { 66 index1 = 0; 67 #ifdef SAL_DLLPREFIX 68 OUStringBuffer aBuf( strlen(lang) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh) 69 aBuf.appendAscii( SAL_DLLPREFIX ); 70 #else 71 OUStringBuffer aBuf( strlen(lang) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh) 72 #endif 73 aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION ); 74 hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT ); 75 if( hModule ) { 76 sal_IntPtr (*func)(); 77 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData ); 78 existMark = (sal_uInt8*) (*func)(); 79 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData ); 80 index1 = (sal_Int16*) (*func)(); 81 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData ); 82 index2 = (sal_Int32*) (*func)(); 83 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData ); 84 lenArray = (sal_Int32*) (*func)(); 85 func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData ); 86 dataArea = (sal_Unicode*) (*func)(); 87 } 88 else 89 { 90 existMark = NULL; 91 index1 = NULL; 92 index2 = NULL; 93 lenArray = NULL; 94 dataArea = NULL; 95 } 96 97 for (sal_Int32 i = 0; i < CACHE_MAX; i++) 98 cache[i].size = 0; 99 100 #if USE_CELL_BOUNDARY_CODE 101 useCellBoundary = sal_False; 102 cellBoundary = NULL; 103 #endif 104 japaneseWordBreak = sal_False; 105 } 106 107 xdictionary::~xdictionary() { 108 osl_unloadModule(hModule); 109 for (sal_Int32 i = 0; i < CACHE_MAX; i++) { 110 if (cache[i].size > 0) { 111 delete cache[i].contents; 112 delete cache[i].wordboundary; 113 } 114 } 115 } 116 117 void xdictionary::setJapaneseWordBreak() 118 { 119 japaneseWordBreak = sal_True; 120 } 121 122 sal_Bool xdictionary::exists(const sal_uInt32 c) { 123 // 0x1FFF is the hardcoded limit in gendict for existMarks 124 sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False; 125 if (!exist && japaneseWordBreak) 126 return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN; 127 else 128 return exist; 129 } 130 131 sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) { 132 133 if ( !index1 ) return 0; 134 135 sal_Int16 idx = index1[str[0] >> 8]; 136 137 if (idx == 0xFF) return 0; 138 139 idx = (idx<<8) | (str[0]&0xff); 140 141 sal_uInt32 begin = index2[idx], end = index2[idx+1]; 142 143 if (begin == 0) return 0; 144 145 str++; sLen--; // first character is not stored in the dictionary 146 for (sal_uInt32 i = end; i > begin; i--) { 147 sal_Int32 len = lenArray[i] - lenArray[i - 1]; 148 if (sLen >= len) { 149 const sal_Unicode *dstr = dataArea + lenArray[i-1]; 150 sal_Int32 pos = 0; 151 152 while (pos < len && dstr[pos] == str[pos]) { pos++; } 153 154 if (pos == len) 155 return len + 1; 156 } 157 } 158 return 0; 159 } 160 161 162 /* 163 * c-tor 164 */ 165 166 WordBreakCache::WordBreakCache() : 167 length( 0 ), 168 contents( NULL ), 169 wordboundary( NULL ), 170 size( 0 ) 171 { 172 } 173 174 /* 175 * Compare two unicode string, 176 */ 177 178 sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) { 179 // Different length, different string. 180 if (length != boundary.endPos - boundary.startPos) return sal_False; 181 182 for (sal_Int32 i = 0; i < length; i++) 183 if (contents[i] != str[i + boundary.startPos]) return sal_False; 184 185 return sal_True; 186 } 187 188 189 /* 190 * Retrieve the segment containing the character at pos. 191 * @param pos : Position of the given character. 192 * @return true if CJK. 193 */ 194 sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos, 195 Boundary& segBoundary) 196 { 197 sal_Int32 indexUtf16; 198 segBoundary.endPos = segBoundary.startPos = pos; 199 200 indexUtf16 = pos; 201 while (indexUtf16 > 0) 202 { 203 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1); 204 if (u_isWhitespace(ch) || exists(ch)) 205 segBoundary.startPos = indexUtf16; 206 else 207 break; 208 } 209 210 indexUtf16 = pos; 211 while (indexUtf16 < rText.getLength()) 212 { 213 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1); 214 if (u_isWhitespace(ch) || exists(ch)) 215 segBoundary.endPos = indexUtf16; 216 else 217 break; 218 } 219 220 indexUtf16 = segBoundary.startPos; 221 rText.iterateCodePoints(&indexUtf16, 1); 222 return segBoundary.endPos > indexUtf16; 223 } 224 225 #define KANJA 1 226 #define KATAKANA 2 227 #define HIRAKANA 3 228 229 static sal_Int16 JapaneseCharType(sal_Unicode c) 230 { 231 if (0x3041 <= c && c <= 0x309e) 232 return HIRAKANA; 233 if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f)) 234 return KATAKANA; 235 return KANJA; 236 } 237 238 WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary) 239 { 240 241 WordBreakCache& aCache = cache[text[0] & 0x1f]; 242 243 if (aCache.size != 0 && aCache.equals(text, wordBoundary)) 244 return aCache; 245 246 sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos; 247 248 if (aCache.size == 0 || len > aCache.size) { 249 if (aCache.size != 0) { 250 delete aCache.contents; 251 delete aCache.wordboundary; 252 aCache.size = len; 253 } 254 else 255 aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE; 256 aCache.contents = new sal_Unicode[aCache.size + 1]; 257 aCache.wordboundary = new sal_Int32[aCache.size + 2]; 258 } 259 aCache.length = len; 260 memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode)); 261 *(aCache.contents + len) = 0x0000; 262 // reset the wordboundary in cache 263 memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2)); 264 265 sal_Int32 i = 0; // loop variable 266 while (aCache.wordboundary[i] < aCache.length) { 267 len = 0; 268 // look the continuous white space as one word and cashe it 269 while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len])) 270 len ++; 271 272 if (len == 0) { 273 const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i]; 274 sal_Int32 slen = aCache.length - aCache.wordboundary[i]; 275 sal_Int16 type = 0, count = 0; 276 for (;len == 0 && slen > 0; str++, slen--) { 277 len = getLongestMatch(str, slen); 278 if (len == 0) { 279 if (!japaneseWordBreak) { 280 len = 1; 281 } else { 282 if (count == 0) 283 type = JapaneseCharType(*str); 284 else if (type != JapaneseCharType(*str)) 285 break; 286 count++; 287 } 288 } 289 } 290 if (count) { 291 aCache.wordboundary[i+1] = aCache.wordboundary[i] + count; 292 i++; 293 294 #if USE_CELL_BOUNDARY_CODE 295 if (useCellBoundary) { 296 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1]; 297 if (cBoundary > 0) 298 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos; 299 } 300 #endif 301 } 302 } 303 304 if (len) { 305 aCache.wordboundary[i+1] = aCache.wordboundary[i] + len; 306 i++; 307 308 #if USE_CELL_BOUNDARY_CODE 309 if (useCellBoundary) { 310 sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1]; 311 if (cBoundary > 0) 312 aCache.wordboundary[i] = cBoundary - wordBoundary.startPos; 313 } 314 #endif 315 } 316 } 317 aCache.wordboundary[i + 1] = aCache.length + 1; 318 319 return aCache; 320 } 321 322 Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType) 323 { 324 // looking for the first non-whitespace character from anyPos 325 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1); 326 327 while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1); 328 329 return getWordBoundary(rText, anyPos, wordType, true); 330 } 331 332 Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType) 333 { 334 boundary = getWordBoundary(rText, anyPos, wordType, true); 335 anyPos = boundary.endPos; 336 if (anyPos < rText.getLength()) { 337 // looknig for the first non-whitespace character from anyPos 338 sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1); 339 while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1); 340 rText.iterateCodePoints(&anyPos, -1); 341 } 342 343 return getWordBoundary(rText, anyPos, wordType, true); 344 } 345 346 Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection) 347 { 348 const sal_Unicode *text=rText.getStr(); 349 sal_Int32 len=rText.getLength(); 350 if (anyPos >= len || anyPos < 0) { 351 boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len; 352 } else if (seekSegment(rText, anyPos, boundary)) { // character in dict 353 WordBreakCache& aCache = getCache(text, boundary); 354 sal_Int32 i = 0; 355 356 while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++; 357 358 sal_Int32 startPos = aCache.wordboundary[i - 1]; 359 // if bDirection is false 360 if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos)) 361 { 362 sal_Int32 indexUtf16 = anyPos-1; 363 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1); 364 if (u_isWhitespace(ch)) 365 i--; 366 } 367 boundary.endPos = boundary.startPos; 368 rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]); 369 rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]); 370 } else { 371 boundary.startPos = anyPos; 372 if (anyPos < len) rText.iterateCodePoints(&anyPos, 1); 373 boundary.endPos = anyPos < len ? anyPos : len; 374 } 375 if (wordType == WordType::WORD_COUNT) { 376 // skip punctuation for word count. 377 while (boundary.endPos < len) 378 { 379 sal_Int32 indexUtf16 = boundary.endPos; 380 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1))) 381 boundary.endPos = indexUtf16; 382 else 383 break; 384 } 385 } 386 387 return boundary; 388 } 389 390 #if USE_CELL_BOUNDARY_CODE 391 void xdictionary::setCellBoundary(sal_Int32* cellArray) 392 { 393 useCellBoundary = sal_True; 394 cellBoundary = cellArray; 395 } 396 #endif 397 398 } } } } 399