1*cdf0e10cSrcweir /************************************************************************* 2*cdf0e10cSrcweir * 3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4*cdf0e10cSrcweir * 5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6*cdf0e10cSrcweir * 7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8*cdf0e10cSrcweir * 9*cdf0e10cSrcweir * This file is part of OpenOffice.org. 10*cdf0e10cSrcweir * 11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13*cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14*cdf0e10cSrcweir * 15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20*cdf0e10cSrcweir * 21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22*cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24*cdf0e10cSrcweir * for a copy of the LGPLv3 License. 25*cdf0e10cSrcweir * 26*cdf0e10cSrcweir ************************************************************************/ 27*cdf0e10cSrcweir 28*cdf0e10cSrcweir #include <stdlib.h> 29*cdf0e10cSrcweir #include <stdio.h> 30*cdf0e10cSrcweir #include <fcntl.h> 31*cdf0e10cSrcweir #include <errno.h> 32*cdf0e10cSrcweir #include <string.h> 33*cdf0e10cSrcweir #include <unistd.h> 34*cdf0e10cSrcweir #include <ctype.h> 35*cdf0e10cSrcweir #include <sal/alloca.h> 36*cdf0e10cSrcweir 37*cdf0e10cSrcweir #include <rtl/ustring.hxx> 38*cdf0e10cSrcweir 39*cdf0e10cSrcweir #include <map> 40*cdf0e10cSrcweir #include <string> 41*cdf0e10cSrcweir 42*cdf0e10cSrcweir /***************************************************************************** 43*cdf0e10cSrcweir * typedefs 44*cdf0e10cSrcweir *****************************************************************************/ 45*cdf0e10cSrcweir 46*cdf0e10cSrcweir typedef std::map< const std::string, rtl_TextEncoding > EncodingMap; 47*cdf0e10cSrcweir 48*cdf0e10cSrcweir struct _pair { 49*cdf0e10cSrcweir const char *key; 50*cdf0e10cSrcweir rtl_TextEncoding value; 51*cdf0e10cSrcweir }; 52*cdf0e10cSrcweir 53*cdf0e10cSrcweir static int _pair_compare (const char *key, const _pair *pair); 54*cdf0e10cSrcweir static const _pair* _pair_search (const char *key, const _pair *base, unsigned int member ); 55*cdf0e10cSrcweir 56*cdf0e10cSrcweir 57*cdf0e10cSrcweir const _pair _ms_encoding_list[] = { 58*cdf0e10cSrcweir { "0", RTL_TEXTENCODING_UTF8 }, 59*cdf0e10cSrcweir { "1250", RTL_TEXTENCODING_MS_1250 }, 60*cdf0e10cSrcweir { "1251", RTL_TEXTENCODING_MS_1251 }, 61*cdf0e10cSrcweir { "1252", RTL_TEXTENCODING_MS_1252 }, 62*cdf0e10cSrcweir { "1253", RTL_TEXTENCODING_MS_1253 }, 63*cdf0e10cSrcweir { "1254", RTL_TEXTENCODING_MS_1254 }, 64*cdf0e10cSrcweir { "1255", RTL_TEXTENCODING_MS_1255 }, 65*cdf0e10cSrcweir { "1256", RTL_TEXTENCODING_MS_1256 }, 66*cdf0e10cSrcweir { "1257", RTL_TEXTENCODING_MS_1257 }, 67*cdf0e10cSrcweir { "1258", RTL_TEXTENCODING_MS_1258 }, 68*cdf0e10cSrcweir { "874", RTL_TEXTENCODING_MS_874 }, 69*cdf0e10cSrcweir { "932", RTL_TEXTENCODING_MS_932 }, 70*cdf0e10cSrcweir { "936", RTL_TEXTENCODING_MS_936 }, 71*cdf0e10cSrcweir { "949", RTL_TEXTENCODING_MS_949 }, 72*cdf0e10cSrcweir { "950", RTL_TEXTENCODING_MS_950 } 73*cdf0e10cSrcweir }; 74*cdf0e10cSrcweir 75*cdf0e10cSrcweir 76*cdf0e10cSrcweir /***************************************************************************** 77*cdf0e10cSrcweir * fgets that work with unix line ends on Windows 78*cdf0e10cSrcweir *****************************************************************************/ 79*cdf0e10cSrcweir 80*cdf0e10cSrcweir char * my_fgets(char *s, int n, FILE *fp) 81*cdf0e10cSrcweir { 82*cdf0e10cSrcweir int i; 83*cdf0e10cSrcweir for( i=0; i < n-1; i++ ) 84*cdf0e10cSrcweir { 85*cdf0e10cSrcweir int c = getc(fp); 86*cdf0e10cSrcweir 87*cdf0e10cSrcweir if( c == EOF ) 88*cdf0e10cSrcweir break; 89*cdf0e10cSrcweir 90*cdf0e10cSrcweir s[i] = (char) c; 91*cdf0e10cSrcweir 92*cdf0e10cSrcweir if( s[i] == '\n' ) 93*cdf0e10cSrcweir { 94*cdf0e10cSrcweir i++; 95*cdf0e10cSrcweir break; 96*cdf0e10cSrcweir } 97*cdf0e10cSrcweir } 98*cdf0e10cSrcweir 99*cdf0e10cSrcweir if( i>0 ) 100*cdf0e10cSrcweir { 101*cdf0e10cSrcweir s[i] = '\0'; 102*cdf0e10cSrcweir return s; 103*cdf0e10cSrcweir } 104*cdf0e10cSrcweir else 105*cdf0e10cSrcweir { 106*cdf0e10cSrcweir return NULL; 107*cdf0e10cSrcweir } 108*cdf0e10cSrcweir } 109*cdf0e10cSrcweir 110*cdf0e10cSrcweir /***************************************************************************** 111*cdf0e10cSrcweir * compare function for binary search 112*cdf0e10cSrcweir *****************************************************************************/ 113*cdf0e10cSrcweir 114*cdf0e10cSrcweir static int 115*cdf0e10cSrcweir _pair_compare (const char *key, const _pair *pair) 116*cdf0e10cSrcweir { 117*cdf0e10cSrcweir int result = rtl_str_compareIgnoreAsciiCase( key, pair->key ); 118*cdf0e10cSrcweir return result; 119*cdf0e10cSrcweir } 120*cdf0e10cSrcweir 121*cdf0e10cSrcweir /***************************************************************************** 122*cdf0e10cSrcweir * binary search on encoding tables 123*cdf0e10cSrcweir *****************************************************************************/ 124*cdf0e10cSrcweir 125*cdf0e10cSrcweir static const _pair* 126*cdf0e10cSrcweir _pair_search (const char *key, const _pair *base, unsigned int member ) 127*cdf0e10cSrcweir { 128*cdf0e10cSrcweir unsigned int lower = 0; 129*cdf0e10cSrcweir unsigned int upper = member; 130*cdf0e10cSrcweir unsigned int current; 131*cdf0e10cSrcweir int comparison; 132*cdf0e10cSrcweir 133*cdf0e10cSrcweir /* check for validity of input */ 134*cdf0e10cSrcweir if ( (key == NULL) || (base == NULL) || (member == 0) ) 135*cdf0e10cSrcweir return NULL; 136*cdf0e10cSrcweir 137*cdf0e10cSrcweir /* binary search */ 138*cdf0e10cSrcweir while ( lower < upper ) 139*cdf0e10cSrcweir { 140*cdf0e10cSrcweir current = (lower + upper) / 2; 141*cdf0e10cSrcweir comparison = _pair_compare( key, base + current ); 142*cdf0e10cSrcweir if (comparison < 0) 143*cdf0e10cSrcweir upper = current; 144*cdf0e10cSrcweir else 145*cdf0e10cSrcweir if (comparison > 0) 146*cdf0e10cSrcweir lower = current + 1; 147*cdf0e10cSrcweir else 148*cdf0e10cSrcweir return base + current; 149*cdf0e10cSrcweir } 150*cdf0e10cSrcweir 151*cdf0e10cSrcweir return NULL; 152*cdf0e10cSrcweir } 153*cdf0e10cSrcweir 154*cdf0e10cSrcweir 155*cdf0e10cSrcweir /************************************************************************ 156*cdf0e10cSrcweir * read_encoding_table 157*cdf0e10cSrcweir ************************************************************************/ 158*cdf0e10cSrcweir 159*cdf0e10cSrcweir void read_encoding_table(char * file, EncodingMap& aEncodingMap) 160*cdf0e10cSrcweir { 161*cdf0e10cSrcweir FILE * fp = fopen(file, "r"); 162*cdf0e10cSrcweir if ( ! fp ) { 163*cdf0e10cSrcweir fprintf(stderr, "ulfconv: %s %s\n", file, strerror(errno)); 164*cdf0e10cSrcweir exit(2); 165*cdf0e10cSrcweir } 166*cdf0e10cSrcweir 167*cdf0e10cSrcweir char buffer[512]; 168*cdf0e10cSrcweir while ( NULL != my_fgets(buffer, sizeof(buffer), fp) ) { 169*cdf0e10cSrcweir 170*cdf0e10cSrcweir // strip comment lines 171*cdf0e10cSrcweir if ( buffer[0] == '#' ) 172*cdf0e10cSrcweir continue; 173*cdf0e10cSrcweir 174*cdf0e10cSrcweir // find end of language string 175*cdf0e10cSrcweir char * cp; 176*cdf0e10cSrcweir for ( cp = buffer; ! isspace(*cp); cp++ ) 177*cdf0e10cSrcweir ; 178*cdf0e10cSrcweir *cp = '\0'; 179*cdf0e10cSrcweir 180*cdf0e10cSrcweir // find start of codepage string 181*cdf0e10cSrcweir for ( ++cp; isspace(*cp); ++cp ) 182*cdf0e10cSrcweir ; 183*cdf0e10cSrcweir char * codepage = cp; 184*cdf0e10cSrcweir 185*cdf0e10cSrcweir // find end of codepage string 186*cdf0e10cSrcweir for ( ++cp; ! isspace(*cp); ++cp ) 187*cdf0e10cSrcweir ; 188*cdf0e10cSrcweir *cp = '\0'; 189*cdf0e10cSrcweir 190*cdf0e10cSrcweir // find the correct mapping for codepage 191*cdf0e10cSrcweir const unsigned int members = sizeof( _ms_encoding_list ) / sizeof( _pair ); 192*cdf0e10cSrcweir const _pair *encoding = _pair_search( codepage, _ms_encoding_list, members ); 193*cdf0e10cSrcweir 194*cdf0e10cSrcweir if ( encoding != NULL ) { 195*cdf0e10cSrcweir const std::string language(buffer); 196*cdf0e10cSrcweir aEncodingMap.insert( EncodingMap::value_type(language, encoding->value) ); 197*cdf0e10cSrcweir } 198*cdf0e10cSrcweir } 199*cdf0e10cSrcweir 200*cdf0e10cSrcweir fclose(fp); 201*cdf0e10cSrcweir } 202*cdf0e10cSrcweir 203*cdf0e10cSrcweir /************************************************************************ 204*cdf0e10cSrcweir * print_legacy_mixed 205*cdf0e10cSrcweir ************************************************************************/ 206*cdf0e10cSrcweir 207*cdf0e10cSrcweir void print_legacy_mixed( 208*cdf0e10cSrcweir FILE * ostream, 209*cdf0e10cSrcweir const rtl::OUString& aString, 210*cdf0e10cSrcweir const std::string& language, 211*cdf0e10cSrcweir EncodingMap& aEncodingMap) 212*cdf0e10cSrcweir { 213*cdf0e10cSrcweir EncodingMap::iterator iter = aEncodingMap.find(language); 214*cdf0e10cSrcweir 215*cdf0e10cSrcweir if ( iter != aEncodingMap.end() ) { 216*cdf0e10cSrcweir fputs(OUStringToOString(aString, iter->second).getStr(), ostream); 217*cdf0e10cSrcweir } else { 218*cdf0e10cSrcweir fprintf(stderr, "ulfconv: WARNING: no legacy encoding found for %s\n", language.c_str()); 219*cdf0e10cSrcweir } 220*cdf0e10cSrcweir } 221*cdf0e10cSrcweir 222*cdf0e10cSrcweir /************************************************************************ 223*cdf0e10cSrcweir * print_java_style 224*cdf0e10cSrcweir ************************************************************************/ 225*cdf0e10cSrcweir 226*cdf0e10cSrcweir void print_java_style(FILE * ostream, const rtl::OUString& aString) 227*cdf0e10cSrcweir { 228*cdf0e10cSrcweir int imax = aString.getLength(); 229*cdf0e10cSrcweir for (int i = 0; i < imax; i++) { 230*cdf0e10cSrcweir sal_Unicode uc = aString[i]; 231*cdf0e10cSrcweir if ( uc < 128 ) { 232*cdf0e10cSrcweir fprintf(ostream, "%c", (char) uc); 233*cdf0e10cSrcweir } else { 234*cdf0e10cSrcweir fprintf(ostream, "\\u%2.2x%2.2x", uc >> 8, uc & 0xFF ); 235*cdf0e10cSrcweir } 236*cdf0e10cSrcweir } 237*cdf0e10cSrcweir } 238*cdf0e10cSrcweir 239*cdf0e10cSrcweir /************************************************************************ 240*cdf0e10cSrcweir * main 241*cdf0e10cSrcweir ************************************************************************/ 242*cdf0e10cSrcweir 243*cdf0e10cSrcweir int main( int argc, char * const argv[] ) 244*cdf0e10cSrcweir { 245*cdf0e10cSrcweir EncodingMap aEncodingMap; 246*cdf0e10cSrcweir 247*cdf0e10cSrcweir FILE *istream = stdin; 248*cdf0e10cSrcweir FILE *ostream = stdout; 249*cdf0e10cSrcweir 250*cdf0e10cSrcweir char *outfile = NULL; 251*cdf0e10cSrcweir 252*cdf0e10cSrcweir int errflg = 0; 253*cdf0e10cSrcweir int argi; 254*cdf0e10cSrcweir 255*cdf0e10cSrcweir for( argi=1; argi < argc; argi++ ) 256*cdf0e10cSrcweir { 257*cdf0e10cSrcweir if( argv[argi][0] == '-' && argv[argi][2] == '\0' ) 258*cdf0e10cSrcweir { 259*cdf0e10cSrcweir switch(argv[argi][1]) { 260*cdf0e10cSrcweir case 'o': 261*cdf0e10cSrcweir if (argi+1 >= argc || argv[argi+1][0] == '-') 262*cdf0e10cSrcweir { 263*cdf0e10cSrcweir fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]); 264*cdf0e10cSrcweir errflg++; 265*cdf0e10cSrcweir break; 266*cdf0e10cSrcweir } 267*cdf0e10cSrcweir 268*cdf0e10cSrcweir ++argi; 269*cdf0e10cSrcweir outfile = argv[argi]; 270*cdf0e10cSrcweir break; 271*cdf0e10cSrcweir case 't': 272*cdf0e10cSrcweir if (argi+1 >= argc || argv[argi+1][0] == '-') 273*cdf0e10cSrcweir { 274*cdf0e10cSrcweir fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]); 275*cdf0e10cSrcweir errflg++; 276*cdf0e10cSrcweir break; 277*cdf0e10cSrcweir } 278*cdf0e10cSrcweir 279*cdf0e10cSrcweir read_encoding_table(argv[++argi], aEncodingMap); 280*cdf0e10cSrcweir break; 281*cdf0e10cSrcweir default: 282*cdf0e10cSrcweir fprintf(stderr, "Unrecognized option: -%c\n", argv[argi][1]); 283*cdf0e10cSrcweir errflg++; 284*cdf0e10cSrcweir } 285*cdf0e10cSrcweir } 286*cdf0e10cSrcweir else 287*cdf0e10cSrcweir { 288*cdf0e10cSrcweir break; 289*cdf0e10cSrcweir } 290*cdf0e10cSrcweir } 291*cdf0e10cSrcweir 292*cdf0e10cSrcweir if (errflg) { 293*cdf0e10cSrcweir fprintf(stderr, "Usage: ulfconv [-o <output file>] [-t <encoding table>] [<ulf file>]\n"); 294*cdf0e10cSrcweir exit(2); 295*cdf0e10cSrcweir } 296*cdf0e10cSrcweir 297*cdf0e10cSrcweir /* assign input file to stdin */ 298*cdf0e10cSrcweir if ( argi < argc ) 299*cdf0e10cSrcweir { 300*cdf0e10cSrcweir istream = fopen(argv[argi], "r"); 301*cdf0e10cSrcweir if ( istream == NULL ) { 302*cdf0e10cSrcweir fprintf(stderr, "ulfconv: %s : %s\n", argv[argi], strerror(errno)); 303*cdf0e10cSrcweir exit(2); 304*cdf0e10cSrcweir } 305*cdf0e10cSrcweir } 306*cdf0e10cSrcweir 307*cdf0e10cSrcweir /* open output file if any */ 308*cdf0e10cSrcweir if ( outfile ) 309*cdf0e10cSrcweir { 310*cdf0e10cSrcweir ostream = fopen(outfile, "w"); 311*cdf0e10cSrcweir if ( ostream == NULL ) { 312*cdf0e10cSrcweir fprintf(stderr, "ulfconv: %s : %s\n", outfile, strerror(errno)); 313*cdf0e10cSrcweir fclose(istream); 314*cdf0e10cSrcweir exit(2); 315*cdf0e10cSrcweir } 316*cdf0e10cSrcweir } 317*cdf0e10cSrcweir 318*cdf0e10cSrcweir /* read line by line from stdin */ 319*cdf0e10cSrcweir char buffer[65536]; 320*cdf0e10cSrcweir while ( NULL != fgets(buffer, sizeof(buffer), istream) ) { 321*cdf0e10cSrcweir 322*cdf0e10cSrcweir /* only handle lines containing " = " */ 323*cdf0e10cSrcweir char * cp = strstr(buffer, " = \""); 324*cdf0e10cSrcweir if ( cp ) { 325*cdf0e10cSrcweir rtl::OUString aString; 326*cdf0e10cSrcweir 327*cdf0e10cSrcweir /* find end of lang string */ 328*cdf0e10cSrcweir int n; 329*cdf0e10cSrcweir for ( n=0; ! isspace(buffer[n]); n++ ) 330*cdf0e10cSrcweir ; 331*cdf0e10cSrcweir 332*cdf0e10cSrcweir std::string line = buffer; 333*cdf0e10cSrcweir std::string lang(line, 0, n); 334*cdf0e10cSrcweir 335*cdf0e10cSrcweir cp += 4; 336*cdf0e10cSrcweir rtl_string2UString( &aString.pData, cp, strrchr(cp, '\"') - cp, 337*cdf0e10cSrcweir RTL_TEXTENCODING_UTF8, OSTRING_TO_OUSTRING_CVTFLAGS ); 338*cdf0e10cSrcweir 339*cdf0e10cSrcweir fprintf(ostream, "%s = \"", lang.c_str()); 340*cdf0e10cSrcweir 341*cdf0e10cSrcweir if ( aEncodingMap.empty() ) { 342*cdf0e10cSrcweir print_java_style(ostream, aString); 343*cdf0e10cSrcweir } else { 344*cdf0e10cSrcweir print_legacy_mixed(ostream, aString, lang, aEncodingMap); 345*cdf0e10cSrcweir } 346*cdf0e10cSrcweir 347*cdf0e10cSrcweir fprintf(ostream, "\"\n"); 348*cdf0e10cSrcweir 349*cdf0e10cSrcweir 350*cdf0e10cSrcweir } else { 351*cdf0e10cSrcweir fputs(buffer, ostream); 352*cdf0e10cSrcweir } 353*cdf0e10cSrcweir } 354*cdf0e10cSrcweir 355*cdf0e10cSrcweir fclose(ostream); 356*cdf0e10cSrcweir fclose(istream); 357*cdf0e10cSrcweir } 358