wtratree.cxx (revision 3cd96b95fb0ad23ccdd883f9b15a685c459d45ca) - OpenGrok cross reference for /AOO41X/main/l10ntools/source/wtratree.cxx

/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/


// MARKER(update_precomp.py): autogen include statement, do not remove
#include "precompiled_l10ntools.hxx"


#include "wtratree.hxx"


/** @ATTENTION
    For reasons of speed, class WordTransTree works with two plain
    char arrays, sOutput and sInput, instead of safe containers or
    streams. So be extremely careful when changing this code!
**/


// NOT FULLY DECLARED SERVICES
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include "wtranode.hxx"


const BRANCH_T  BR_END          = 0;
const BRANCH_T  BR_NONALPHA     = 1;
const BRANCH_T  BR_HOTKEY       = 2;
const BRANCH_T  BR_BACKSLASH    = 3;
const BRANCH_T  BR_ALPHABASE    = 4;    /// @ATTENTION  All branches not valid for words must be smaller than this value!
const BRANCH_T  BR_AE           = 30;
const BRANCH_T  BR_OE           = 31;
const BRANCH_T  BR_UE           = 32;
const BRANCH_T  BR_SZ           = 33;
const BRANCH_T  BR_MAX          = 34;   /// @ATTENTION  Must be updated always!

const BRANCH_T  BR_START        = 0;


WordTransTree::WordTransTree(CharSet  i_nWorkingCharSet)
    :   sInput(0),
        nInputLength(0),
        pInputEnd(0),
        sOutput(0),
        nOutputMaxLength(0),
        dpParsingTreeTop(0),
        pUnknownAlpha(0),
        // cChar2Branch
        c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')),
        c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')),
        pInputCurTokenStart(0),
        pInputPosition(0),
        pOutputPosition(0),
        pCurParseNode(0),
        eCurResult(OK),
        cCurHotkey(0),
        cCurHotkeySign(u_char('~'))
{
    // Initialize parsing tree:
    pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0); // This will be deleted as part of the parsing tree.
    for ( UINT8 i = BR_ALPHABASE; i < C_NR_OF_BRANCHES; i++)
    {
        pUnknownAlpha->SetBranch(i,pUnknownAlpha);
    }  // end for

    dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha);

    WTT_Node * dpNonAlpha = new WTT_Node(BR_NONALPHA,0,0);

    dpNonAlpha->SetBranch(BR_NONALPHA,dpNonAlpha);
    dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlpha);

    WTT_Node * dpBackslash = new WTT_Node(BR_BACKSLASH,dpNonAlpha,dpNonAlpha);
    dpBackslash->SetBranch(BR_END,0);

    dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslash);
    dpNonAlpha->SetBranch(BR_BACKSLASH,dpBackslash);


    // Initialize character set:
    SetCharSet(i_nWorkingCharSet);

    if (C_BR_ALPHABASE != BR_ALPHABASE || C_NR_OF_BRANCHES != BR_MAX)
    {
        fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__,  __LINE__);
        exit(1);
    }
}

void
WordTransTree::SetCharSet(CharSet i_nWorkingCharSet)
{
    ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF");
    const u_char * pConvert = (const u_char * ) ( sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet).GetBuffer() );

    INT16 i = 0;
    for ( ; i < C_NR_OF_POSSIBLE_CHARS; ++i )
    {
        cChar2Branch[i] = BR_NONALPHA;
    }  // end for
    for ( i = 'a'; i <= 'z'; ++i )
    {
        cChar2Branch[i] = BR_ALPHABASE + i - 'a';
    }  // end for
    for ( i = 'A'; i <= 'Z'; ++i )
    {
        cChar2Branch[i] = BR_ALPHABASE + i - 'A';
    }  // end for
    cChar2Branch[pConvert[0]] = BR_AE;
    cChar2Branch[pConvert[1]] = BR_OE;
    cChar2Branch[pConvert[2]] = BR_UE;
    cChar2Branch[pConvert[3]] = BR_AE;
    cChar2Branch[pConvert[4]] = BR_OE;
    cChar2Branch[pConvert[5]] = BR_UE;
    cChar2Branch[pConvert[6]] = BR_SZ;

    cChar2Branch[u_char('~')] = BR_HOTKEY;
    cChar2Branch[u_char('&')] = BR_HOTKEY;


    c_AE = pConvert[0];
    c_OE = pConvert[1];
    c_UE = pConvert[2];
    c_ae = pConvert[3];
    c_oe = pConvert[4];
    c_ue = pConvert[5];
}

WordTransTree::~WordTransTree()
{
    delete dpParsingTreeTop;
    if (sOutput != 0)
        delete [] sOutput;
}

void
WordTransTree::AddWordPair( const ByteString &      i_sOldString,
                            const ByteString &      i_sReplaceString )
{
    if (i_sOldString.Len() == 0)
        return;

    pCurParseNode = dpParsingTreeTop;
    WTT_Node * pBranch = 0;
    char cBranch = 0;

    for ( constr pOld = i_sOldString.GetBuffer();
          *pOld != 0;
          pOld++ )
    {
        cBranch = CalculateBranch(*pOld);
        pBranch = pCurParseNode->GetNextNode(cBranch);
        if (pBranch == 0 || pBranch == pUnknownAlpha)
        {
            pBranch = new WTT_Node(cBranch,0,pUnknownAlpha);
            pCurParseNode->SetBranch(cBranch,pBranch);
        }
        pCurParseNode = pBranch;
    }   // end for
    pCurParseNode->SetAsTokenToReplace(i_sReplaceString);
}

void
WordTransTree::InitTransformation( const char * i_sInput,
                                   UINT32       i_nInputLength,
                                   UINT32       i_nOutputMaxLength )
{
    sInput = (const u_char *)i_sInput;
    nInputLength = i_nInputLength;
    pInputEnd = &sInput[i_nInputLength];

    pInputCurTokenStart = sInput;
    pInputPosition = sInput;

    if (nOutputMaxLength < i_nOutputMaxLength)
    {
        if (sOutput != 0)
            delete [] sOutput;
        sOutput = new unsigned char[i_nOutputMaxLength];
        nOutputMaxLength = i_nOutputMaxLength;
    }
    pOutputPosition = sOutput;
}

/** pInputCurTokenStart and CurParseNode are updated just when
    starting this function. After its end they must not be changed
    till this functon is called again.
    Outside this function pInputPositon and pOutputPosition are both
    on the first not transformed char in their respective array.
**/
WordTransTree::E_Result
WordTransTree::TransformNextToken()
{
    pInputCurTokenStart = pInputPosition;
    pCurParseNode = dpParsingTreeTop;
    cCurHotkey = 0;
    eCurResult = OK;

    WTT_Node * pBranch = 0;
    UINT8 cBranch = 0;

    for ( pCurParseNode = dpParsingTreeTop;
          pInputPosition != pInputEnd;
          ++pInputPosition )
    {
        cBranch = CalculateBranch(*pInputPosition);
        pBranch = pCurParseNode->GetNextNode( cBranch );
        if (pBranch != 0)
        {
            pCurParseNode = pBranch;
        }
        else
        {
            if (cBranch == BR_HOTKEY)   // current letter is '~' or '&'.
            {
                // Logic of the following. There are 9 possible cases -
                // A = alphabetic letter, NA = non alphabetic, TB = token begin,
                // Eot = end of text:
                //   1. A~A          set hotkey to following letter, continue
                //   2. A~NA         token end
                //   3. A~Eot        token end
                //   4. NA~A         token end
                //   5. NA~NA        continue
                //   6. A~Eof        continue
                //   7. TB~A         set hotkey to following letter, continue
                //   8. TB~NA        continue
                //   9. TB~Eot       continue

                // bNext and Prev are true, if there are alphabetic letters:
                sal_Bool bNext =  pInputPosition + 1 != pInputEnd
                                    ?   CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE
                                    :   sal_False;
                sal_Bool bPrev = pCurParseNode->Value() >= BR_ALPHABASE;

                if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) )
                {   // case 1. and 7.
                    Handle_Hotkey();
                    continue;
                }
                else if  (!bPrev && !bNext)
                {   // case 5.,6.,8.,9.
                    continue;
                }

                // Case 2.,3.,4. :
                //  so this should be handled as an end of a token.
            }
            if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
            {
                Handle_TokenToKeep();
                return eCurResult;
            }
            else
            {
                Handle_TokenToTransform();
                return eCurResult;
            }   // endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
        }   // endif (pBranch == 0) else
    }   // end for

    // If here, the text end is reached
    if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
    {
        Handle_TokenToKeep();
        return eCurResult;
    }
    else
    {
        Handle_TokenToTransform();
        return eCurResult;
    }
}

ByteString
WordTransTree::CurReplacingString() const
{
    return pCurParseNode->ReplaceString();
}

void
WordTransTree::Handle_Hotkey()
{
    if (cCurHotkey == 0)    // Avoid to replace the first found hotkey by
                            //   a later one - though this shouldn't happen anyway.
    {
        cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0;
        cCurHotkeySign = *pInputPosition;
    }
}

void
WordTransTree::Handle_TokenToKeep()
{
    UINT32 nTokenLength = pInputPosition-pInputCurTokenStart;

    memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength);

    pOutputPosition += nTokenLength;
    *pOutputPosition = '\0';
}

void
WordTransTree::Handle_TokenToTransform()
{
    sal_Bool bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE;
    const ByteString & rReplace = pCurParseNode->ReplaceString();

    // Find position of hotkey in replace-string:
    sal_uInt16 nHotkeyPos = bHaveHotkey
                            ?   rReplace.Search(char(cCurHotkey))
                            :   STRING_NOTFOUND;
    if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey)
    {
        if (cCurHotkey < 128)
        {
            if (islower(cCurHotkey))
                nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey)));
            else
                nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey)));
        }
        else    // cCurHotkey >= 128
        {
            if (cCurHotkey == c_ae)
                nHotkeyPos = rReplace.Search(char(c_AE));
            else if (cCurHotkey == c_oe)
                nHotkeyPos = rReplace.Search(char(c_OE));
            else if (cCurHotkey == c_ue)
                nHotkeyPos = rReplace.Search(char(c_UE));
            else if (cCurHotkey == c_AE)
                nHotkeyPos = rReplace.Search(char(c_ae));
            else if (cCurHotkey == c_OE)
                nHotkeyPos = rReplace.Search(char(c_oe));
            else if (cCurHotkey == c_UE)
                nHotkeyPos = rReplace.Search(char(c_ue));
        }   // endif (cCurHotkey < 128) else

        if (nHotkeyPos == STRING_NOTFOUND)
        {
            eCurResult = HOTKEY_LOST;
            bHaveHotkey = sal_False;
        }
    }   // endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey)


    UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0);

    if (bHaveHotkey)
    {
        memcpy( pOutputPosition,
                pCurParseNode->ReplaceString().GetBuffer(),
                nHotkeyPos );
        *(pOutputPosition + nHotkeyPos) = cCurHotkeySign;
        memcpy( pOutputPosition + nHotkeyPos + 1,
                pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos,
                nOutputTokenLength - nHotkeyPos - 1);
    }
    else
    {
        memcpy( pOutputPosition,
                pCurParseNode->ReplaceString().GetBuffer(),
                nOutputTokenLength );
    }

    // Convert first letter into upper if necessary:
    u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY
                            ?   pInputCurTokenStart[1]
                            :   pInputCurTokenStart[0] ;
    u_char * pOutStart = nHotkeyPos == 0
                            ?   pOutputPosition + 1
                            :   pOutputPosition ;
    if (isupper(cInStart) || cInStart > 127)
    {   // Possibly cInStart is upper character:
        if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE)
        {   // Surely cInStart is upper character:
            u_char cOutStart = *pOutStart;
            if (cOutStart < 128)
                *pOutStart = toupper(cOutStart);
            else if (cOutStart == c_ae)
                *pOutStart = c_AE;
            else if (cOutStart == c_oe)
                *pOutStart = c_OE;
            else if (cOutStart == c_ue)
                *pOutStart = c_UE;
        }
    }   // endif (isupper(cInStart) || cInStart > 127)

    pOutputPosition += nOutputTokenLength;
    *pOutputPosition = '\0';
}