xref: /AOO41X/main/l10ntools/source/wtratree.cxx (revision 3cd96b95fb0ad23ccdd883f9b15a685c459d45ca)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_l10ntools.hxx"
26 
27 
28 #include "wtratree.hxx"
29 
30 
31 
32 /** @ATTENTION
33     For reasons of speed, class WordTransTree works with two simple
34     char arrays, sOutput and sInput, instead of secure containers or
35     streams. So be extremely careful, when changing this code!!!
36 **/
37 
38 
39 
40 // NOT FULLY DECLARED SERVICES
41 #include <string.h>
42 #include <stdio.h>
43 #include <ctype.h>
44 #include "wtranode.hxx"
45 
46 
47 const BRANCH_T  BR_END          = 0;
48 const BRANCH_T  BR_NONALPHA     = 1;
49 const BRANCH_T  BR_HOTKEY       = 2;
50 const BRANCH_T  BR_BACKSLASH    = 3;
51 const BRANCH_T  BR_ALPHABASE    = 4;    /// @ATTENTION  All branches not valid for words must be smaller than this value!
52 const BRANCH_T  BR_AE           = 30;
53 const BRANCH_T  BR_OE           = 31;
54 const BRANCH_T  BR_UE           = 32;
55 const BRANCH_T  BR_SZ           = 33;
56 const BRANCH_T  BR_MAX          = 34;   /// @ATTENTION  Must be updated always!
57 
58 const BRANCH_T  BR_START        = 0;
59 
60 
61 
62 
63 
WordTransTree(CharSet i_nWorkingCharSet)64 WordTransTree::WordTransTree(CharSet  i_nWorkingCharSet)
65     :   sInput(0),
66         nInputLength(0),
67         pInputEnd(0),
68         sOutput(0),
69         nOutputMaxLength(0),
70         dpParsingTreeTop(0),
71         pUnknownAlpha(0),
72         // cChar2Branch
73         c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')),
74         c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')),
75         pInputCurTokenStart(0),
76         pInputPosition(0),
77         pOutputPosition(0),
78         pCurParseNode(0),
79         eCurResult(OK),
80         cCurHotkey(0),
81         cCurHotkeySign(u_char('~'))
82 {
83     // Initialize parsing tree:
84     pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0); // This will be deleted as part of the parsing tree.
85     for ( UINT8 i = BR_ALPHABASE; i < C_NR_OF_BRANCHES; i++)
86     {
87         pUnknownAlpha->SetBranch(i,pUnknownAlpha);
88     }  // end for
89 
90     dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha);
91 
92     WTT_Node * dpNonAlpha = new WTT_Node(BR_NONALPHA,0,0);
93 
94     dpNonAlpha->SetBranch(BR_NONALPHA,dpNonAlpha);
95     dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlpha);
96 
97     WTT_Node * dpBackslash = new WTT_Node(BR_BACKSLASH,dpNonAlpha,dpNonAlpha);
98     dpBackslash->SetBranch(BR_END,0);
99 
100     dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslash);
101     dpNonAlpha->SetBranch(BR_BACKSLASH,dpBackslash);
102 
103 
104     // Initialize character set:
105     SetCharSet(i_nWorkingCharSet);
106 
107     if (C_BR_ALPHABASE != BR_ALPHABASE || C_NR_OF_BRANCHES != BR_MAX)
108     {
109         fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__,  __LINE__);
110         exit(1);
111     }
112 }
113 
114 void
SetCharSet(CharSet i_nWorkingCharSet)115 WordTransTree::SetCharSet(CharSet i_nWorkingCharSet)
116 {
117     ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF");
118     const u_char * pConvert = (const u_char * ) ( sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet).GetBuffer() );
119 
120     INT16 i = 0;
121     for ( ; i < C_NR_OF_POSSIBLE_CHARS; ++i )
122     {
123         cChar2Branch[i] = BR_NONALPHA;
124     }  // end for
125     for ( i = 'a'; i <= 'z'; ++i )
126     {
127         cChar2Branch[i] = BR_ALPHABASE + i - 'a';
128     }  // end for
129     for ( i = 'A'; i <= 'Z'; ++i )
130     {
131         cChar2Branch[i] = BR_ALPHABASE + i - 'A';
132     }  // end for
133     cChar2Branch[pConvert[0]] = BR_AE;
134     cChar2Branch[pConvert[1]] = BR_OE;
135     cChar2Branch[pConvert[2]] = BR_UE;
136     cChar2Branch[pConvert[3]] = BR_AE;
137     cChar2Branch[pConvert[4]] = BR_OE;
138     cChar2Branch[pConvert[5]] = BR_UE;
139     cChar2Branch[pConvert[6]] = BR_SZ;
140 
141     cChar2Branch[u_char('~')] = BR_HOTKEY;
142     cChar2Branch[u_char('&')] = BR_HOTKEY;
143 
144 
145     c_AE = pConvert[0];
146     c_OE = pConvert[1];
147     c_UE = pConvert[2];
148     c_ae = pConvert[3];
149     c_oe = pConvert[4];
150     c_ue = pConvert[5];
151 }
152 
~WordTransTree()153 WordTransTree::~WordTransTree()
154 {
155     delete dpParsingTreeTop;
156     if (sOutput != 0)
157         delete [] sOutput;
158 }
159 
160 void
AddWordPair(const ByteString & i_sOldString,const ByteString & i_sReplaceString)161 WordTransTree::AddWordPair( const ByteString &      i_sOldString,
162                             const ByteString &      i_sReplaceString )
163 {
164     if (i_sOldString.Len() == 0)
165         return;
166 
167     pCurParseNode = dpParsingTreeTop;
168     WTT_Node * pBranch = 0;
169     char cBranch = 0;
170 
171     for ( constr pOld = i_sOldString.GetBuffer();
172           *pOld != 0;
173           pOld++ )
174     {
175         cBranch = CalculateBranch(*pOld);
176         pBranch = pCurParseNode->GetNextNode(cBranch);
177         if (pBranch == 0 || pBranch == pUnknownAlpha)
178         {
179             pBranch = new WTT_Node(cBranch,0,pUnknownAlpha);
180             pCurParseNode->SetBranch(cBranch,pBranch);
181         }
182         pCurParseNode = pBranch;
183     }   // end for
184     pCurParseNode->SetAsTokenToReplace(i_sReplaceString);
185 }
186 
187 void
InitTransformation(const char * i_sInput,UINT32 i_nInputLength,UINT32 i_nOutputMaxLength)188 WordTransTree::InitTransformation( const char * i_sInput,
189                                    UINT32       i_nInputLength,
190                                    UINT32       i_nOutputMaxLength )
191 {
192     sInput = (const u_char *)i_sInput;
193     nInputLength = i_nInputLength;
194     pInputEnd = &sInput[i_nInputLength];
195 
196     pInputCurTokenStart = sInput;
197     pInputPosition = sInput;
198 
199     if (nOutputMaxLength < i_nOutputMaxLength)
200     {
201         if (sOutput != 0)
202             delete [] sOutput;
203         sOutput = new unsigned char[i_nOutputMaxLength];
204         nOutputMaxLength = i_nOutputMaxLength;
205     }
206     pOutputPosition = sOutput;
207 }
208 
209 /** pInputCurTokenStart and CurParseNode are updated just when
210     starting this function. After its end they must not be changed
211     till this functon is called again.
212     Outside this function pInputPositon and pOutputPosition are both
213     on the first not transformed char in their respective array.
214 **/
215 WordTransTree::E_Result
TransformNextToken()216 WordTransTree::TransformNextToken()
217 {
218     pInputCurTokenStart = pInputPosition;
219     pCurParseNode = dpParsingTreeTop;
220     cCurHotkey = 0;
221     eCurResult = OK;
222 
223     WTT_Node * pBranch = 0;
224     UINT8 cBranch = 0;
225 
226     for ( pCurParseNode = dpParsingTreeTop;
227           pInputPosition != pInputEnd;
228           ++pInputPosition )
229     {
230         cBranch = CalculateBranch(*pInputPosition);
231         pBranch = pCurParseNode->GetNextNode( cBranch );
232         if (pBranch != 0)
233         {
234             pCurParseNode = pBranch;
235         }
236         else
237         {
238             if (cBranch == BR_HOTKEY)   // current letter is '~' or '&'.
239             {
240                 // Logic of the following. There are 9 possible cases -
241                 // A = alphabetic letter, NA = non alphabetic, TB = token begin,
242                 // Eot = end of text:
243                 //   1. A~A          set hotkey to following letter, continue
244                 //   2. A~NA         token end
245                 //   3. A~Eot        token end
246                 //   4. NA~A         token end
247                 //   5. NA~NA        continue
248                 //   6. A~Eof        continue
249                 //   7. TB~A         set hotkey to following letter, continue
250                 //   8. TB~NA        continue
251                 //   9. TB~Eot       continue
252 
253                 // bNext and Prev are true, if there are alphabetic letters:
254                 sal_Bool bNext =  pInputPosition + 1 != pInputEnd
255                                     ?   CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE
256                                     :   sal_False;
257                 sal_Bool bPrev = pCurParseNode->Value() >= BR_ALPHABASE;
258 
259                 if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) )
260                 {   // case 1. and 7.
261                     Handle_Hotkey();
262                     continue;
263                 }
264                 else if  (!bPrev && !bNext)
265                 {   // case 5.,6.,8.,9.
266                     continue;
267                 }
268 
269                 // Case 2.,3.,4. :
270                 //  so this should be handled as an end of a token.
271             }
272             if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
273             {
274                 Handle_TokenToKeep();
275                 return eCurResult;
276             }
277             else
278             {
279                 Handle_TokenToTransform();
280                 return eCurResult;
281             }   // endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
282         }   // endif (pBranch == 0) else
283     }   // end for
284 
285     // If here, the text end is reached
286     if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
287     {
288         Handle_TokenToKeep();
289         return eCurResult;
290     }
291     else
292     {
293         Handle_TokenToTransform();
294         return eCurResult;
295     }
296 }
297 
298 ByteString
CurReplacingString() const299 WordTransTree::CurReplacingString() const
300 {
301     return pCurParseNode->ReplaceString();
302 }
303 
304 void
Handle_Hotkey()305 WordTransTree::Handle_Hotkey()
306 {
307     if (cCurHotkey == 0)    // Avoid to replace the first found hotkey by
308                             //   a later one - though this shouldn't happen anyway.
309     {
310         cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0;
311         cCurHotkeySign = *pInputPosition;
312     }
313 }
314 
315 void
Handle_TokenToKeep()316 WordTransTree::Handle_TokenToKeep()
317 {
318     UINT32 nTokenLength = pInputPosition-pInputCurTokenStart;
319 
320     memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength);
321 
322     pOutputPosition += nTokenLength;
323     *pOutputPosition = '\0';
324 }
325 
326 void
Handle_TokenToTransform()327 WordTransTree::Handle_TokenToTransform()
328 {
329     sal_Bool bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE;
330     const ByteString & rReplace = pCurParseNode->ReplaceString();
331 
332     // Find position of hotkey in replace-string:
333     sal_uInt16 nHotkeyPos = bHaveHotkey
334                             ?   rReplace.Search(char(cCurHotkey))
335                             :   STRING_NOTFOUND;
336     if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey)
337     {
338         if (cCurHotkey < 128)
339         {
340             if (islower(cCurHotkey))
341                 nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey)));
342             else
343                 nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey)));
344         }
345         else    // cCurHotkey >= 128
346         {
347             if (cCurHotkey == c_ae)
348                 nHotkeyPos = rReplace.Search(char(c_AE));
349             else if (cCurHotkey == c_oe)
350                 nHotkeyPos = rReplace.Search(char(c_OE));
351             else if (cCurHotkey == c_ue)
352                 nHotkeyPos = rReplace.Search(char(c_UE));
353             else if (cCurHotkey == c_AE)
354                 nHotkeyPos = rReplace.Search(char(c_ae));
355             else if (cCurHotkey == c_OE)
356                 nHotkeyPos = rReplace.Search(char(c_oe));
357             else if (cCurHotkey == c_UE)
358                 nHotkeyPos = rReplace.Search(char(c_ue));
359         }   // endif (cCurHotkey < 128) else
360 
361         if (nHotkeyPos == STRING_NOTFOUND)
362         {
363             eCurResult = HOTKEY_LOST;
364             bHaveHotkey = sal_False;
365         }
366     }   // endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey)
367 
368 
369     UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0);
370 
371     if (bHaveHotkey)
372     {
373         memcpy( pOutputPosition,
374                 pCurParseNode->ReplaceString().GetBuffer(),
375                 nHotkeyPos );
376         *(pOutputPosition + nHotkeyPos) = cCurHotkeySign;
377         memcpy( pOutputPosition + nHotkeyPos + 1,
378                 pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos,
379                 nOutputTokenLength - nHotkeyPos - 1);
380     }
381     else
382     {
383         memcpy( pOutputPosition,
384                 pCurParseNode->ReplaceString().GetBuffer(),
385                 nOutputTokenLength );
386     }
387 
388     // Convert first letter into upper if necessary:
389     u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY
390                             ?   pInputCurTokenStart[1]
391                             :   pInputCurTokenStart[0] ;
392     u_char * pOutStart = nHotkeyPos == 0
393                             ?   pOutputPosition + 1
394                             :   pOutputPosition ;
395     if (isupper(cInStart) || cInStart > 127)
396     {   // Possibly cInStart is upper character:
397         if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE)
398         {   // Surely cInStart is upper character:
399             u_char cOutStart = *pOutStart;
400             if (cOutStart < 128)
401                 *pOutStart = toupper(cOutStart);
402             else if (cOutStart == c_ae)
403                 *pOutStart = c_AE;
404             else if (cOutStart == c_oe)
405                 *pOutStart = c_OE;
406             else if (cOutStart == c_ue)
407                 *pOutStart = c_UE;
408         }
409     }   // endif (isupper(cInStart) || cInStart > 127)
410 
411     pOutputPosition += nOutputTokenLength;
412     *pOutputPosition = '\0';
413 }
414 
415