1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 #include "convertgb18030.h" 25 #include "context.h" 26 #include "converter.h" 27 #include "tenchelp.h" 28 #include "unichars.h" 29 #include "rtl/alloc.h" 30 #include "rtl/textcvt.h" 31 #include "sal/types.h" 32 33 typedef enum 34 { 35 IMPL_GB_18030_TO_UNICODE_STATE_0, 36 IMPL_GB_18030_TO_UNICODE_STATE_1, 37 IMPL_GB_18030_TO_UNICODE_STATE_2, 38 IMPL_GB_18030_TO_UNICODE_STATE_3 39 } ImplGb18030ToUnicodeState; 40 41 typedef struct 42 { 43 ImplGb18030ToUnicodeState m_eState; 44 sal_uInt32 m_nCode; 45 } ImplGb18030ToUnicodeContext; 46 47 void * ImplCreateGb18030ToUnicodeContext(void) 48 { 49 void * pContext 50 = rtl_allocateMemory(sizeof (ImplGb18030ToUnicodeContext)); 51 ((ImplGb18030ToUnicodeContext *) pContext)->m_eState 52 = IMPL_GB_18030_TO_UNICODE_STATE_0; 53 return pContext; 54 } 55 56 void ImplResetGb18030ToUnicodeContext(void * pContext) 57 { 58 if (pContext) 59 ((ImplGb18030ToUnicodeContext *) pContext)->m_eState 60 = IMPL_GB_18030_TO_UNICODE_STATE_0; 61 } 62 63 sal_Size ImplConvertGb18030ToUnicode(ImplTextConverterData const * pData, 64 void * pContext, 65 sal_Char const * pSrcBuf, 66 sal_Size nSrcBytes, 67 sal_Unicode * pDestBuf, 68 sal_Size nDestChars, 69 sal_uInt32 nFlags, 70 sal_uInt32 * pInfo, 71 sal_Size * pSrcCvtBytes) 72 { 73 sal_Unicode const * pGb18030Data 74 = ((ImplGb18030ConverterData const *) pData)->m_pGb18030ToUnicodeData; 75 ImplGb180302000ToUnicodeRange const * pGb18030Ranges 76 = ((ImplGb18030ConverterData const *) pData)-> 77 m_pGb18030ToUnicodeRanges; 78 ImplGb18030ToUnicodeState eState = IMPL_GB_18030_TO_UNICODE_STATE_0; 79 sal_uInt32 nCode = 0; 80 sal_uInt32 nInfo = 0; 81 sal_Size nConverted = 0; 82 sal_Unicode * pDestBufPtr = pDestBuf; 83 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; 84 85 if (pContext) 86 { 87 eState = ((ImplGb18030ToUnicodeContext *) pContext)->m_eState; 88 nCode = ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode; 89 } 90 91 for (; nConverted < nSrcBytes; ++nConverted) 92 { 93 sal_Bool bUndefined = sal_True; 94 sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++; 95 switch (eState) 96 { 97 case IMPL_GB_18030_TO_UNICODE_STATE_0: 98 if (nChar < 0x80) 99 if (pDestBufPtr != pDestBufEnd) 100 *pDestBufPtr++ = (sal_Unicode) nChar; 101 else 102 goto no_output; 103 else if (nChar == 0x80) 104 goto bad_input; 105 else if (nChar <= 0xFE) 106 { 107 nCode = nChar - 0x81; 108 eState = IMPL_GB_18030_TO_UNICODE_STATE_1; 109 } 110 else 111 { 112 bUndefined = sal_False; 113 goto bad_input; 114 } 115 break; 116 117 case IMPL_GB_18030_TO_UNICODE_STATE_1: 118 if (nChar >= 0x30 && nChar <= 0x39) 119 { 120 nCode = nCode * 10 + (nChar - 0x30); 121 eState = IMPL_GB_18030_TO_UNICODE_STATE_2; 122 } 123 else if ((nChar >= 0x40 && nChar <= 0x7E) 124 || (nChar >= 0x80 && nChar <= 0xFE)) 125 { 126 nCode = nCode * 190 + (nChar <= 0x7E ? nChar - 0x40 : 127 nChar - 0x80 + 63); 128 if (pDestBufPtr != pDestBufEnd) 129 *pDestBufPtr++ = pGb18030Data[nCode]; 130 else 131 goto no_output; 132 eState = IMPL_GB_18030_TO_UNICODE_STATE_0; 133 } 134 else 135 { 136 bUndefined = sal_False; 137 goto bad_input; 138 } 139 break; 140 141 case IMPL_GB_18030_TO_UNICODE_STATE_2: 142 if (nChar >= 0x81 && nChar <= 0xFE) 143 { 144 nCode = nCode * 126 + (nChar - 0x81); 145 eState = IMPL_GB_18030_TO_UNICODE_STATE_3; 146 } 147 else 148 { 149 bUndefined = sal_False; 150 goto bad_input; 151 } 152 break; 153 154 case IMPL_GB_18030_TO_UNICODE_STATE_3: 155 if (nChar >= 0x30 && nChar <= 0x39) 156 { 157 nCode = nCode * 10 + (nChar - 0x30); 158 159 /* 90 30 81 30 to E3 32 9A 35 maps to U+10000 to U+10FFFF: */ 160 if (nCode >= 189000 && nCode <= 1237575) 161 if (pDestBufEnd - pDestBufPtr >= 2) 162 { 163 nCode -= 189000 - 0x10000; 164 *pDestBufPtr++ 165 = (sal_Unicode) ImplGetHighSurrogate(nCode); 166 *pDestBufPtr++ 167 = (sal_Unicode) ImplGetLowSurrogate(nCode); 168 } 169 else 170 goto no_output; 171 else 172 { 173 ImplGb180302000ToUnicodeRange const * pRange 174 = pGb18030Ranges; 175 sal_uInt32 nFirstNonRange = 0; 176 for (;;) 177 { 178 if (pRange->m_nNonRangeDataIndex == -1) 179 goto bad_input; 180 else if (nCode < pRange->m_nFirstLinear) 181 { 182 if (pDestBufPtr != pDestBufEnd) 183 *pDestBufPtr++ 184 = pGb18030Data[ 185 pRange->m_nNonRangeDataIndex 186 + (nCode - nFirstNonRange)]; 187 else 188 goto no_output; 189 break; 190 } 191 else if (nCode < pRange->m_nPastLinear) 192 { 193 if (pDestBufPtr != pDestBufEnd) 194 *pDestBufPtr++ 195 = (sal_Unicode) 196 (pRange->m_nFirstUnicode 197 + (nCode 198 - pRange-> 199 m_nFirstLinear)); 200 else 201 goto no_output; 202 break; 203 } 204 nFirstNonRange = (pRange++)->m_nPastLinear; 205 } 206 } 207 eState = IMPL_GB_18030_TO_UNICODE_STATE_0; 208 } 209 else 210 { 211 bUndefined = sal_False; 212 goto bad_input; 213 } 214 break; 215 } 216 continue; 217 218 bad_input: 219 switch (ImplHandleBadInputTextToUnicodeConversion( 220 bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, 221 &nInfo)) 222 { 223 case IMPL_BAD_INPUT_STOP: 224 eState = IMPL_GB_18030_TO_UNICODE_STATE_0; 225 break; 226 227 case IMPL_BAD_INPUT_CONTINUE: 228 eState = IMPL_GB_18030_TO_UNICODE_STATE_0; 229 continue; 230 231 case IMPL_BAD_INPUT_NO_OUTPUT: 232 goto no_output; 233 } 234 break; 235 236 no_output: 237 --pSrcBuf; 238 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 239 break; 240 } 241 242 if (eState != IMPL_GB_18030_TO_UNICODE_STATE_0 243 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR 244 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)) 245 == 0) 246 { 247 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) 248 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; 249 else 250 switch (ImplHandleBadInputTextToUnicodeConversion( 251 sal_False, sal_True, 0, nFlags, &pDestBufPtr, 252 pDestBufEnd, &nInfo)) 253 { 254 case IMPL_BAD_INPUT_STOP: 255 case IMPL_BAD_INPUT_CONTINUE: 256 eState = IMPL_GB_18030_TO_UNICODE_STATE_0; 257 break; 258 259 case IMPL_BAD_INPUT_NO_OUTPUT: 260 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 261 break; 262 } 263 } 264 265 if (pContext) 266 { 267 ((ImplGb18030ToUnicodeContext *) pContext)->m_eState = eState; 268 ((ImplGb18030ToUnicodeContext *) pContext)->m_nCode = nCode; 269 } 270 if (pInfo) 271 *pInfo = nInfo; 272 if (pSrcCvtBytes) 273 *pSrcCvtBytes = nConverted; 274 275 return pDestBufPtr - pDestBuf; 276 } 277 278 sal_Size ImplConvertUnicodeToGb18030(ImplTextConverterData const * pData, 279 void * pContext, 280 sal_Unicode const * pSrcBuf, 281 sal_Size nSrcChars, 282 sal_Char * pDestBuf, 283 sal_Size nDestBytes, 284 sal_uInt32 nFlags, 285 sal_uInt32 * pInfo, 286 sal_Size * pSrcCvtChars) 287 { 288 sal_uInt32 const * pGb18030Data 289 = ((ImplGb18030ConverterData const *) pData)-> 290 m_pUnicodeToGb18030Data; 291 ImplUnicodeToGb180302000Range const * pGb18030Ranges 292 = ((ImplGb18030ConverterData const *) pData)-> 293 m_pUnicodeToGb18030Ranges; 294 sal_Unicode nHighSurrogate = 0; 295 sal_uInt32 nInfo = 0; 296 sal_Size nConverted = 0; 297 sal_Char * pDestBufPtr = pDestBuf; 298 sal_Char * pDestBufEnd = pDestBuf + nDestBytes; 299 300 if (pContext) 301 nHighSurrogate 302 = ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate; 303 304 for (; nConverted < nSrcChars; ++nConverted) 305 { 306 sal_Bool bUndefined = sal_True; 307 sal_uInt32 nChar = *pSrcBuf++; 308 if (nHighSurrogate == 0) 309 { 310 if (ImplIsHighSurrogate(nChar)) 311 { 312 nHighSurrogate = (sal_Unicode) nChar; 313 continue; 314 } 315 } 316 else if (ImplIsLowSurrogate(nChar)) 317 nChar = ImplCombineSurrogates(nHighSurrogate, nChar); 318 else 319 { 320 bUndefined = sal_False; 321 goto bad_input; 322 } 323 324 if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar)) 325 { 326 bUndefined = sal_False; 327 goto bad_input; 328 } 329 330 if (nChar < 0x80) 331 if (pDestBufPtr != pDestBufEnd) 332 *pDestBufPtr++ = (sal_Char) nChar; 333 else 334 goto no_output; 335 else if (nChar < 0x10000) 336 { 337 ImplUnicodeToGb180302000Range const * pRange = pGb18030Ranges; 338 sal_Unicode nFirstNonRange = 0x80; 339 for (;;) 340 { 341 if (nChar < pRange->m_nFirstUnicode) 342 { 343 sal_uInt32 nCode 344 = pGb18030Data[pRange->m_nNonRangeDataIndex 345 + (nChar - nFirstNonRange)]; 346 if (pDestBufEnd - pDestBufPtr 347 >= (nCode <= 0xFFFF ? 2 : 4)) 348 { 349 if (nCode > 0xFFFF) 350 { 351 *pDestBufPtr++ = (sal_Char) (nCode >> 24); 352 *pDestBufPtr++ = (sal_Char) (nCode >> 16 & 0xFF); 353 } 354 *pDestBufPtr++ = (sal_Char) (nCode >> 8 & 0xFF); 355 *pDestBufPtr++ = (sal_Char) (nCode & 0xFF); 356 } 357 else 358 goto no_output; 359 break; 360 } 361 else if (nChar <= pRange->m_nLastUnicode) 362 { 363 if (pDestBufEnd - pDestBufPtr >= 4) 364 { 365 sal_uInt32 nCode 366 = pRange->m_nFirstLinear 367 + (nChar - pRange->m_nFirstUnicode); 368 *pDestBufPtr++ = (sal_Char) (nCode / 12600 + 0x81); 369 *pDestBufPtr++ 370 = (sal_Char) (nCode / 1260 % 10 + 0x30); 371 *pDestBufPtr++ = (sal_Char) (nCode / 10 % 126 + 0x81); 372 *pDestBufPtr++ = (sal_Char) (nCode % 10 + 0x30); 373 } 374 else 375 goto no_output; 376 break; 377 } 378 nFirstNonRange 379 = (sal_Unicode) ((pRange++)->m_nLastUnicode + 1); 380 } 381 } 382 else 383 if (pDestBufEnd - pDestBufPtr >= 4) 384 { 385 sal_uInt32 nCode = nChar - 0x10000; 386 *pDestBufPtr++ = (sal_Char) (nCode / 12600 + 0x90); 387 *pDestBufPtr++ = (sal_Char) (nCode / 1260 % 10 + 0x30); 388 *pDestBufPtr++ = (sal_Char) (nCode / 10 % 126 + 0x81); 389 *pDestBufPtr++ = (sal_Char) (nCode % 10 + 0x30); 390 } 391 else 392 goto no_output; 393 nHighSurrogate = 0; 394 continue; 395 396 bad_input: 397 switch (ImplHandleBadInputUnicodeToTextConversion(bUndefined, 398 nChar, 399 nFlags, 400 &pDestBufPtr, 401 pDestBufEnd, 402 &nInfo, 403 NULL, 404 0, 405 NULL)) 406 { 407 case IMPL_BAD_INPUT_STOP: 408 nHighSurrogate = 0; 409 break; 410 411 case IMPL_BAD_INPUT_CONTINUE: 412 nHighSurrogate = 0; 413 continue; 414 415 case IMPL_BAD_INPUT_NO_OUTPUT: 416 goto no_output; 417 } 418 break; 419 420 no_output: 421 --pSrcBuf; 422 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 423 break; 424 } 425 426 if (nHighSurrogate != 0 427 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR 428 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL)) 429 == 0) 430 { 431 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) 432 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL; 433 else 434 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 435 0, 436 nFlags, 437 &pDestBufPtr, 438 pDestBufEnd, 439 &nInfo, 440 NULL, 441 0, 442 NULL)) 443 { 444 case IMPL_BAD_INPUT_STOP: 445 case IMPL_BAD_INPUT_CONTINUE: 446 nHighSurrogate = 0; 447 break; 448 449 case IMPL_BAD_INPUT_NO_OUTPUT: 450 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 451 break; 452 } 453 } 454 455 if (pContext) 456 ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate 457 = nHighSurrogate; 458 if (pInfo) 459 *pInfo = nInfo; 460 if (pSrcCvtChars) 461 *pSrcCvtChars = nConverted; 462 463 return pDestBufPtr - pDestBuf; 464 } 465