1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 #include "convertiso2022jp.h" 25 #include "context.h" 26 #include "converter.h" 27 #include "tenchelp.h" 28 #include "unichars.h" 29 #include "rtl/alloc.h" 30 #include "rtl/textcvt.h" 31 #include "sal/types.h" 32 33 typedef enum /* order is important: */ 34 { 35 IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII, 36 IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN, 37 IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208, 38 IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2, 39 IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC, 40 IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN, 41 IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR 42 } ImplIso2022JpToUnicodeState; 43 44 typedef struct 45 { 46 ImplIso2022JpToUnicodeState m_eState; 47 sal_uInt32 m_nRow; 48 } ImplIso2022JpToUnicodeContext; 49 50 typedef struct 51 { 52 sal_Unicode m_nHighSurrogate; 53 sal_Bool m_b0208; 54 } ImplUnicodeToIso2022JpContext; 55 56 void * ImplCreateIso2022JpToUnicodeContext(void) 57 { 58 void * pContext 59 = rtl_allocateMemory(sizeof (ImplIso2022JpToUnicodeContext)); 60 ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState 61 = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 62 return pContext; 63 } 64 65 void ImplResetIso2022JpToUnicodeContext(void * pContext) 66 { 67 if (pContext) 68 ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState 69 = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 70 } 71 72 sal_Size ImplConvertIso2022JpToUnicode(ImplTextConverterData const * pData, 73 void * pContext, 74 sal_Char const * pSrcBuf, 75 sal_Size nSrcBytes, 76 sal_Unicode * pDestBuf, 77 sal_Size nDestChars, 78 sal_uInt32 nFlags, 79 sal_uInt32 * pInfo, 80 sal_Size * pSrcCvtBytes) 81 { 82 ImplDBCSToUniLeadTab const * pJisX0208Data 83 = ((ImplIso2022JpConverterData const *) pData)-> 84 m_pJisX0208ToUnicodeData; 85 ImplIso2022JpToUnicodeState eState 86 = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 87 sal_uInt32 nRow = 0; 88 sal_uInt32 nInfo = 0; 89 sal_Size nConverted = 0; 90 sal_Unicode * pDestBufPtr = pDestBuf; 91 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; 92 93 if (pContext) 94 { 95 eState = ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState; 96 nRow = ((ImplIso2022JpToUnicodeContext *) pContext)->m_nRow; 97 } 98 99 for (; nConverted < nSrcBytes; ++nConverted) 100 { 101 sal_Bool bUndefined = sal_True; 102 sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++; 103 switch (eState) 104 { 105 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII: 106 if (nChar == 0x1B) /* ESC */ 107 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC; 108 else if (nChar < 0x80) 109 if (pDestBufPtr != pDestBufEnd) 110 *pDestBufPtr++ = (sal_Unicode) nChar; 111 else 112 goto no_output; 113 else 114 { 115 bUndefined = sal_False; 116 goto bad_input; 117 } 118 break; 119 120 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN: 121 if (nChar == 0x1B) /* ESC */ 122 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC; 123 else if (nChar < 0x80) 124 if (pDestBufPtr != pDestBufEnd) 125 { 126 switch (nChar) 127 { 128 case 0x5C: /* \ */ 129 nChar = 0xA5; /* YEN SIGN */ 130 break; 131 132 case 0x7E: /* ~ */ 133 nChar = 0xAF; /* MACRON */ 134 break; 135 } 136 *pDestBufPtr++ = (sal_Unicode) nChar; 137 } 138 else 139 goto no_output; 140 else 141 { 142 bUndefined = sal_False; 143 goto bad_input; 144 } 145 break; 146 147 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208: 148 if (nChar == 0x1B) /* ESC */ 149 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC; 150 else if (nChar >= 0x21 && nChar <= 0x7E) 151 { 152 nRow = nChar; 153 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2; 154 } 155 else 156 { 157 bUndefined = sal_False; 158 goto bad_input; 159 } 160 break; 161 162 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2: 163 if (nChar >= 0x21 && nChar <= 0x7E) 164 { 165 sal_uInt16 nUnicode = 0; 166 sal_uInt32 nFirst = pJisX0208Data[nRow].mnTrailStart; 167 if (nChar >= nFirst 168 && nChar <= pJisX0208Data[nRow].mnTrailEnd) 169 nUnicode = pJisX0208Data[nRow]. 170 mpToUniTrailTab[nChar - nFirst]; 171 if (nUnicode != 0) 172 if (pDestBufPtr != pDestBufEnd) 173 { 174 *pDestBufPtr++ = (sal_Unicode) nUnicode; 175 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208; 176 } 177 else 178 goto no_output; 179 else 180 goto bad_input; 181 } 182 else 183 { 184 bUndefined = sal_False; 185 goto bad_input; 186 } 187 break; 188 189 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC: 190 switch (nChar) 191 { 192 case 0x24: /* $ */ 193 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR; 194 break; 195 196 case 0x28: /* ( */ 197 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN; 198 break; 199 200 default: 201 bUndefined = sal_False; 202 goto bad_input; 203 } 204 break; 205 206 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN: 207 switch (nChar) 208 { 209 case 0x42: /* A */ 210 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 211 break; 212 213 case 0x4A: /* J */ 214 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN; 215 break; 216 217 default: 218 bUndefined = sal_False; 219 goto bad_input; 220 } 221 break; 222 223 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR: 224 switch (nChar) 225 { 226 case 0x40: /* @ */ 227 case 0x42: /* B */ 228 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208; 229 break; 230 231 default: 232 bUndefined = sal_False; 233 goto bad_input; 234 } 235 break; 236 } 237 continue; 238 239 bad_input: 240 switch (ImplHandleBadInputTextToUnicodeConversion( 241 bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, 242 &nInfo)) 243 { 244 case IMPL_BAD_INPUT_STOP: 245 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 246 break; 247 248 case IMPL_BAD_INPUT_CONTINUE: 249 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 250 continue; 251 252 case IMPL_BAD_INPUT_NO_OUTPUT: 253 goto no_output; 254 } 255 break; 256 257 no_output: 258 --pSrcBuf; 259 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 260 break; 261 } 262 263 if (eState > IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208 264 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR 265 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)) 266 == 0) 267 { 268 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) 269 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; 270 else 271 switch (ImplHandleBadInputTextToUnicodeConversion( 272 sal_False, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, 273 &nInfo)) 274 { 275 case IMPL_BAD_INPUT_STOP: 276 case IMPL_BAD_INPUT_CONTINUE: 277 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 278 break; 279 280 case IMPL_BAD_INPUT_NO_OUTPUT: 281 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 282 break; 283 } 284 } 285 286 if (pContext) 287 { 288 ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState = eState; 289 ((ImplIso2022JpToUnicodeContext *) pContext)->m_nRow = nRow; 290 } 291 if (pInfo) 292 *pInfo = nInfo; 293 if (pSrcCvtBytes) 294 *pSrcCvtBytes = nConverted; 295 296 return pDestBufPtr - pDestBuf; 297 } 298 299 void * ImplCreateUnicodeToIso2022JpContext(void) 300 { 301 void * pContext 302 = rtl_allocateMemory(sizeof (ImplUnicodeToIso2022JpContext)); 303 ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate = 0; 304 ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = sal_False; 305 return pContext; 306 } 307 308 void ImplResetUnicodeToIso2022JpContext(void * pContext) 309 { 310 if (pContext) 311 { 312 ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate = 0; 313 ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = sal_False; 314 } 315 } 316 317 sal_Size ImplConvertUnicodeToIso2022Jp(ImplTextConverterData const * pData, 318 void * pContext, 319 sal_Unicode const * pSrcBuf, 320 sal_Size nSrcChars, 321 sal_Char * pDestBuf, 322 sal_Size nDestBytes, 323 sal_uInt32 nFlags, 324 sal_uInt32 * pInfo, 325 sal_Size * pSrcCvtChars) 326 { 327 ImplUniToDBCSHighTab const * pJisX0208Data 328 = ((ImplIso2022JpConverterData const *) pData)-> 329 m_pUnicodeToJisX0208Data; 330 sal_Unicode nHighSurrogate = 0; 331 sal_Bool b0208 = sal_False; 332 sal_uInt32 nInfo = 0; 333 sal_Size nConverted = 0; 334 sal_Char * pDestBufPtr = pDestBuf; 335 sal_Char * pDestBufEnd = pDestBuf + nDestBytes; 336 sal_Bool bWritten; 337 338 if (pContext) 339 { 340 nHighSurrogate 341 = ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate; 342 b0208 = ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208; 343 } 344 345 for (; nConverted < nSrcChars; ++nConverted) 346 { 347 sal_Bool bUndefined = sal_True; 348 sal_uInt32 nChar = *pSrcBuf++; 349 if (nHighSurrogate == 0) 350 { 351 if (ImplIsHighSurrogate(nChar)) 352 { 353 nHighSurrogate = (sal_Unicode) nChar; 354 continue; 355 } 356 } 357 else if (ImplIsLowSurrogate(nChar)) 358 nChar = ImplCombineSurrogates(nHighSurrogate, nChar); 359 else 360 { 361 bUndefined = sal_False; 362 goto bad_input; 363 } 364 365 if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar)) 366 { 367 bUndefined = sal_False; 368 goto bad_input; 369 } 370 371 if (nChar == 0x0A || nChar == 0x0D) /* LF, CR */ 372 { 373 if (b0208) 374 { 375 if (pDestBufEnd - pDestBufPtr >= 3) 376 { 377 *pDestBufPtr++ = 0x1B; /* ESC */ 378 *pDestBufPtr++ = 0x28; /* ( */ 379 *pDestBufPtr++ = 0x42; /* B */ 380 b0208 = sal_False; 381 } 382 else 383 goto no_output; 384 } 385 if (pDestBufPtr != pDestBufEnd) 386 *pDestBufPtr++ = (sal_Char) nChar; 387 else 388 goto no_output; 389 } 390 else if (nChar == 0x1B) 391 goto bad_input; 392 else if (nChar < 0x80) 393 { 394 if (b0208) 395 { 396 if (pDestBufEnd - pDestBufPtr >= 3) 397 { 398 *pDestBufPtr++ = 0x1B; /* ESC */ 399 *pDestBufPtr++ = 0x28; /* ( */ 400 *pDestBufPtr++ = 0x42; /* B */ 401 b0208 = sal_False; 402 } 403 else 404 goto no_output; 405 } 406 if (pDestBufPtr != pDestBufEnd) 407 *pDestBufPtr++ = (sal_Char) nChar; 408 else 409 goto no_output; 410 } 411 else 412 { 413 sal_uInt16 nBytes = 0; 414 sal_uInt32 nIndex1 = nChar >> 8; 415 if (nIndex1 < 0x100) 416 { 417 sal_uInt32 nIndex2 = nChar & 0xFF; 418 sal_uInt32 nFirst = pJisX0208Data[nIndex1].mnLowStart; 419 if (nIndex2 >= nFirst 420 && nIndex2 <= pJisX0208Data[nIndex1].mnLowEnd) 421 { 422 nBytes = pJisX0208Data[nIndex1]. 423 mpToUniTrailTab[nIndex2 - nFirst]; 424 if (nBytes == 0) 425 /* For some reason, the tables in tcvtjp4.tab do not 426 include these two conversions: */ 427 switch (nChar) 428 { 429 case 0xA5: /* YEN SIGN */ 430 nBytes = 0x216F; 431 break; 432 433 case 0xAF: /* MACRON */ 434 nBytes = 0x2131; 435 break; 436 } 437 } 438 } 439 if (nBytes != 0) 440 { 441 if (!b0208) 442 { 443 if (pDestBufEnd - pDestBufPtr >= 3) 444 { 445 *pDestBufPtr++ = 0x1B; /* ESC */ 446 *pDestBufPtr++ = 0x24; /* $ */ 447 *pDestBufPtr++ = 0x42; /* B */ 448 b0208 = sal_True; 449 } 450 else 451 goto no_output; 452 } 453 if (pDestBufEnd - pDestBufPtr >= 2) 454 { 455 *pDestBufPtr++ = (sal_Char) (nBytes >> 8); 456 *pDestBufPtr++ = (sal_Char) (nBytes & 0xFF); 457 } 458 else 459 goto no_output; 460 } 461 else 462 goto bad_input; 463 } 464 nHighSurrogate = 0; 465 continue; 466 467 bad_input: 468 switch (ImplHandleBadInputUnicodeToTextConversion( 469 bUndefined, 470 nChar, 471 nFlags, 472 &pDestBufPtr, 473 pDestBufEnd, 474 &nInfo, 475 "\x1B(B", 476 b0208 ? 3 : 0, 477 &bWritten)) 478 { 479 case IMPL_BAD_INPUT_STOP: 480 nHighSurrogate = 0; 481 break; 482 483 case IMPL_BAD_INPUT_CONTINUE: 484 if (bWritten) 485 b0208 = sal_False; 486 nHighSurrogate = 0; 487 continue; 488 489 case IMPL_BAD_INPUT_NO_OUTPUT: 490 goto no_output; 491 } 492 break; 493 494 no_output: 495 --pSrcBuf; 496 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 497 break; 498 } 499 500 if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR 501 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL)) 502 == 0) 503 { 504 sal_Bool bFlush = sal_True; 505 if (nHighSurrogate != 0) 506 { 507 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) 508 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL; 509 else 510 switch (ImplHandleBadInputUnicodeToTextConversion( 511 sal_False, 512 0, 513 nFlags, 514 &pDestBufPtr, 515 pDestBufEnd, 516 &nInfo, 517 "\x1B(B", 518 b0208 ? 3 : 0, 519 &bWritten)) 520 { 521 case IMPL_BAD_INPUT_STOP: 522 nHighSurrogate = 0; 523 bFlush = sal_False; 524 break; 525 526 case IMPL_BAD_INPUT_CONTINUE: 527 if (bWritten) 528 b0208 = sal_False; 529 nHighSurrogate = 0; 530 break; 531 532 case IMPL_BAD_INPUT_NO_OUTPUT: 533 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 534 break; 535 } 536 } 537 if (bFlush 538 && b0208 539 && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) 540 { 541 if (pDestBufEnd - pDestBufPtr >= 3) 542 { 543 *pDestBufPtr++ = 0x1B; /* ESC */ 544 *pDestBufPtr++ = 0x28; /* ( */ 545 *pDestBufPtr++ = 0x42; /* B */ 546 b0208 = sal_False; 547 } 548 else 549 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 550 } 551 } 552 553 if (pContext) 554 { 555 ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate 556 = nHighSurrogate; 557 ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = b0208; 558 } 559 if (pInfo) 560 *pInfo = nInfo; 561 if (pSrcCvtChars) 562 *pSrcCvtChars = nConverted; 563 564 return pDestBufPtr - pDestBuf; 565 } 566