source/breakiterator/xdictionary.cxx

/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/


// MARKER(update_precomp.py): autogen include statement, do not remove
#include "precompiled_i18npool.hxx"

// xdictionary.cpp: implementation of the xdictionary class.
//
//////////////////////////////////////////////////////////////////////


#include <rtl/ustrbuf.hxx>

#include <com/sun/star/i18n/WordType.hpp>
#include <xdictionary.hxx>
#include <unicode/uchar.h>
#include <string.h>
#include <breakiteratorImpl.hxx>

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

using namespace rtl;

namespace com { namespace sun { namespace star { namespace i18n {

extern "C" { static void SAL_CALL thisModule() {} }

xdictionary::xdictionary(const sal_Char *lang) :
    existMark( NULL ),
    index1( NULL ),
    index2( NULL ),
    lenArray( NULL ),
    dataArea( NULL ),
    hModule( NULL ),
    boundary(),
    japaneseWordBreak( sal_False )
#if USE_CELL_BOUNDARY_CODE
    // For CTL breakiterator, where the word boundary should not be inside cell.
    ,
    useCellBoundary( sal_False ),
    cellBoundary( NULL )
#endif
{
	index1 = 0;
#ifdef SAL_DLLPREFIX
    OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
    aBuf.appendAscii( SAL_DLLPREFIX );
#else
    OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
#endif
    aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
        hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
        if( hModule ) {
            sal_IntPtr (*func)();
            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getExistMark").pData );
            existMark = (sal_uInt8*) (*func)();
            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex1").pData );
            index1 = (sal_Int16*) (*func)();
            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getIndex2").pData );
            index2 = (sal_Int32*) (*func)();
            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getLenArray").pData );
            lenArray = (sal_Int32*) (*func)();
            func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString::createFromAscii("getDataArea").pData );
            dataArea = (sal_Unicode*) (*func)();
        }
        else
		{
            existMark = NULL;
			index1 = NULL;
			index2 = NULL;
			lenArray = NULL;
			dataArea = NULL;
		}

		for (sal_Int32 i = 0; i < CACHE_MAX; i++)
            cache[i].size = 0;

#if USE_CELL_BOUNDARY_CODE
        useCellBoundary = sal_False;
        cellBoundary = NULL;
#endif
        japaneseWordBreak = sal_False;
}

xdictionary::~xdictionary() {
        osl_unloadModule(hModule);
        for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
            if (cache[i].size > 0) {
                delete cache[i].contents;
                delete cache[i].wordboundary;
            }
        }
}

void xdictionary::setJapaneseWordBreak()
{
        japaneseWordBreak = sal_True;
}

sal_Bool xdictionary::exists(const sal_uInt32 c) {
        // 0x1FFF is the hardcoded limit in gendict for existMarks
        sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
        if (!exist && japaneseWordBreak)
            return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
        else
            return exist;
}

sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {

		if ( !index1 ) return 0;

        sal_Int16 idx = index1[str[0] >> 8];

        if (idx == 0xFF) return 0;

        idx = (idx<<8) | (str[0]&0xff);

        sal_uInt32 begin = index2[idx], end = index2[idx+1];

        if (begin == 0) return 0;

        str++; sLen--; // first character is not stored in the dictionary
        for (sal_uInt32 i = end; i > begin; i--) {
            sal_Int32 len = lenArray[i] - lenArray[i - 1];
            if (sLen >= len) {
                const sal_Unicode *dstr = dataArea + lenArray[i-1];
                sal_Int32 pos = 0;

                while (pos < len && dstr[pos] == str[pos]) { pos++; }

                if (pos == len)
                    return len + 1;
            }
        }
        return 0;
}


/*
 * c-tor
 */

WordBreakCache::WordBreakCache() :
    length( 0 ),
    contents( NULL ),
    wordboundary( NULL ),
    size( 0 )
{
}

/*
 * Compare two unicode string,
 */

sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
        // Different length, different string.
        if (length != boundary.endPos - boundary.startPos) return sal_False;

        for (sal_Int32 i = 0; i < length; i++)
            if (contents[i] != str[i + boundary.startPos]) return sal_False;

        return sal_True;
}


/*
 * Retrieve the segment containing the character at pos.
 * @param pos : Position of the given character.
 * @return true if CJK.
 */
sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
	Boundary& segBoundary)
{
    sal_Int32 indexUtf16;
    segBoundary.endPos = segBoundary.startPos = pos;

    indexUtf16 = pos;
    while (indexUtf16 > 0)
    {
        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
        if (u_isWhitespace(ch) || exists(ch))
            segBoundary.startPos = indexUtf16;
        else
            break;
    }

    indexUtf16 = pos;
    while (indexUtf16 < rText.getLength())
    {
        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
        if (u_isWhitespace(ch) || exists(ch))
            segBoundary.endPos = indexUtf16;
        else
            break;
    }

    indexUtf16 = segBoundary.startPos;
    rText.iterateCodePoints(&indexUtf16, 1);
    return segBoundary.endPos > indexUtf16;
}

#define KANJA       1
#define KATAKANA    2
#define HIRAKANA    3

static sal_Int16 JapaneseCharType(sal_Unicode c)
{
    if (0x3041 <= c && c <= 0x309e)
        return HIRAKANA;
    if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
        return KATAKANA;
    return KANJA;
}

WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
{

        WordBreakCache& aCache = cache[text[0] & 0x1f];

        if (aCache.size != 0 && aCache.equals(text, wordBoundary))
            return aCache;

        sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;

        if (aCache.size == 0 || len > aCache.size) {
            if (aCache.size != 0) {
                delete aCache.contents;
                delete aCache.wordboundary;
                aCache.size = len;
            }
            else
                aCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
            aCache.contents = new sal_Unicode[aCache.size + 1];
            aCache.wordboundary = new sal_Int32[aCache.size + 2];
        }
        aCache.length  = len;
        memcpy(aCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
        *(aCache.contents + len) = 0x0000;
        // reset the wordboundary in cache
        memset(aCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));

        sal_Int32 i = 0;        // loop variable
        while (aCache.wordboundary[i] < aCache.length) {
            len = 0;
            // look the continuous white space as one word and cashe it
            while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + aCache.wordboundary[i] + len]))
                len ++;

            if (len == 0) {
                const sal_Unicode *str = text + wordBoundary.startPos + aCache.wordboundary[i];
                sal_Int32 slen = aCache.length - aCache.wordboundary[i];
                sal_Int16 type = 0, count = 0;
                for (;len == 0 && slen > 0; str++, slen--) {
                    len = getLongestMatch(str, slen);
                    if (len == 0) {
                        if (!japaneseWordBreak) {
                            len = 1;
                        } else {
                            if (count == 0)
                                type = JapaneseCharType(*str);
                            else if (type != JapaneseCharType(*str))
                                break;
                            count++;
                        }
                    }
                }
                if (count) {
                    aCache.wordboundary[i+1] = aCache.wordboundary[i] + count;
                    i++;

#if USE_CELL_BOUNDARY_CODE
                    if (useCellBoundary) {
                        sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
                        if (cBoundary > 0)
                            aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
                    }
#endif
                }
            }

            if (len) {
                aCache.wordboundary[i+1] = aCache.wordboundary[i] + len;
                i++;

#if USE_CELL_BOUNDARY_CODE
                if (useCellBoundary) {
                    sal_Int32 cBoundary = cellBoundary[aCache.wordboundary[i] + wordBoundary.startPos - 1];
                    if (cBoundary > 0)
                        aCache.wordboundary[i] = cBoundary - wordBoundary.startPos;
                }
#endif
            }
        }
        aCache.wordboundary[i + 1] = aCache.length + 1;

        return aCache;
}

Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
{
        // looking for the first non-whitespace character from anyPos
        sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);

        while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);

        return getWordBoundary(rText, anyPos, wordType, true);
}

Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
{
        boundary = getWordBoundary(rText, anyPos, wordType, true);
        anyPos = boundary.endPos;
        if (anyPos < rText.getLength()) {
            // looknig for the first non-whitespace character from anyPos
            sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
            while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
            rText.iterateCodePoints(&anyPos, -1);
        }

        return getWordBoundary(rText, anyPos, wordType, true);
}

Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
{
        const sal_Unicode *text=rText.getStr();
        sal_Int32 len=rText.getLength();
        if (anyPos >= len || anyPos < 0) {
            boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
        } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
            WordBreakCache& aCache = getCache(text, boundary);
            sal_Int32 i = 0;

            while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;

            sal_Int32 startPos = aCache.wordboundary[i - 1];
            // if bDirection is false
            if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
            {
                sal_Int32 indexUtf16 = anyPos-1;
                sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
                if (u_isWhitespace(ch))
                    i--;
            }
            boundary.endPos = boundary.startPos;
            rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
            rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
        } else {
            boundary.startPos = anyPos;
            if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
            boundary.endPos = anyPos < len ? anyPos : len;
        }
        if (wordType == WordType::WORD_COUNT) {
            // skip punctuation for word count.
            while (boundary.endPos < len)
            {
                sal_Int32 indexUtf16 = boundary.endPos;
                if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
                    boundary.endPos = indexUtf16;
                else
                    break;
            }
        }

        return boundary;
}

#if USE_CELL_BOUNDARY_CODE
void xdictionary::setCellBoundary(sal_Int32* cellArray)
{
        useCellBoundary = sal_True;
        cellBoundary = cellArray;
}
#endif

} } } }