xref: /AOO41X/main/setup_native/source/ulfconv/ulfconv.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir 
28*cdf0e10cSrcweir #include <stdlib.h>
29*cdf0e10cSrcweir #include <stdio.h>
30*cdf0e10cSrcweir #include <fcntl.h>
31*cdf0e10cSrcweir #include <errno.h>
32*cdf0e10cSrcweir #include <string.h>
33*cdf0e10cSrcweir #include <unistd.h>
34*cdf0e10cSrcweir #include <ctype.h>
35*cdf0e10cSrcweir #include <sal/alloca.h>
36*cdf0e10cSrcweir 
37*cdf0e10cSrcweir #include <rtl/ustring.hxx>
38*cdf0e10cSrcweir 
39*cdf0e10cSrcweir #include <map>
40*cdf0e10cSrcweir #include <string>
41*cdf0e10cSrcweir 
42*cdf0e10cSrcweir /*****************************************************************************
43*cdf0e10cSrcweir  * typedefs
44*cdf0e10cSrcweir  *****************************************************************************/
45*cdf0e10cSrcweir 
/* Lookup table from a language identifier (as read from the encoding table
 * file) to the rtl text encoding that is used when writing strings of that
 * language in legacy mode. */
46*cdf0e10cSrcweir typedef std::map< const std::string, rtl_TextEncoding > EncodingMap;
47*cdf0e10cSrcweir 
/* One entry of a static lookup table: `key` is the name that is searched for
 * (a NUL-terminated C string), `value` the rtl text encoding selected by it. */
48*cdf0e10cSrcweir struct _pair {
49*cdf0e10cSrcweir     const char *key;
50*cdf0e10cSrcweir     rtl_TextEncoding value;
51*cdf0e10cSrcweir };
52*cdf0e10cSrcweir 
/* Forward declarations of the file-local table lookup helpers; both are
 * defined further down in this file. */
53*cdf0e10cSrcweir static int _pair_compare (const char *key, const _pair *pair);
54*cdf0e10cSrcweir static const _pair* _pair_search (const char *key, const _pair *base, unsigned int member );
55*cdf0e10cSrcweir 
56*cdf0e10cSrcweir 
/* Table translating the code page column of the encoding table file into an
 * rtl text encoding (the keys look like Microsoft code page numbers; "0"
 * selects UTF-8).
 *
 * read_encoding_table() looks entries up with _pair_search(), which is a
 * binary search using _pair_compare().  The entries therefore have to stay
 * sorted by key in the order that comparison yields; note that the keys are
 * listed in string order, not numeric order ("874" comes after "1258"). */
57*cdf0e10cSrcweir const _pair _ms_encoding_list[] = {
58*cdf0e10cSrcweir     { "0",       RTL_TEXTENCODING_UTF8        },
59*cdf0e10cSrcweir     { "1250",    RTL_TEXTENCODING_MS_1250     },
60*cdf0e10cSrcweir     { "1251",    RTL_TEXTENCODING_MS_1251     },
61*cdf0e10cSrcweir     { "1252",    RTL_TEXTENCODING_MS_1252     },
62*cdf0e10cSrcweir     { "1253",    RTL_TEXTENCODING_MS_1253     },
63*cdf0e10cSrcweir     { "1254",    RTL_TEXTENCODING_MS_1254     },
64*cdf0e10cSrcweir     { "1255",    RTL_TEXTENCODING_MS_1255     },
65*cdf0e10cSrcweir     { "1256",    RTL_TEXTENCODING_MS_1256     },
66*cdf0e10cSrcweir     { "1257",    RTL_TEXTENCODING_MS_1257     },
67*cdf0e10cSrcweir     { "1258",    RTL_TEXTENCODING_MS_1258     },
68*cdf0e10cSrcweir     { "874",     RTL_TEXTENCODING_MS_874      },
69*cdf0e10cSrcweir     { "932",     RTL_TEXTENCODING_MS_932      },
70*cdf0e10cSrcweir     { "936",     RTL_TEXTENCODING_MS_936      },
71*cdf0e10cSrcweir     { "949",     RTL_TEXTENCODING_MS_949      },
72*cdf0e10cSrcweir     { "950",     RTL_TEXTENCODING_MS_950      }
73*cdf0e10cSrcweir };
74*cdf0e10cSrcweir 
75*cdf0e10cSrcweir 
76*cdf0e10cSrcweir /*****************************************************************************
77*cdf0e10cSrcweir  * fgets that work with unix line ends on Windows
78*cdf0e10cSrcweir  *****************************************************************************/
79*cdf0e10cSrcweir 
/* Reads at most n-1 characters from fp into s, stopping after the first
 * '\n' (which is stored) or at end of file, and NUL-terminates the result.
 * Returns s when at least one character was stored, NULL otherwise. */
char * my_fgets(char *s, int n, FILE *fp)
{
    int count = 0;

    while( count < n-1 )
    {
        const int ch = getc(fp);
        if( ch == EOF )
            break;

        s[count] = (char) ch;
        ++count;

        /* the line end belongs to the line, so it is kept in the buffer */
        if( s[count-1] == '\n' )
            break;
    }

    /* nothing stored: end of file, or no room for any character */
    if( count <= 0 )
        return NULL;

    s[count] = '\0';
    return s;
}
109*cdf0e10cSrcweir 
110*cdf0e10cSrcweir /*****************************************************************************
111*cdf0e10cSrcweir  * compare function for binary search
112*cdf0e10cSrcweir  *****************************************************************************/
113*cdf0e10cSrcweir 
114*cdf0e10cSrcweir static int
115*cdf0e10cSrcweir _pair_compare (const char *key, const _pair *pair)
116*cdf0e10cSrcweir {
117*cdf0e10cSrcweir     int result = rtl_str_compareIgnoreAsciiCase( key, pair->key );
118*cdf0e10cSrcweir     return result;
119*cdf0e10cSrcweir }
120*cdf0e10cSrcweir 
121*cdf0e10cSrcweir /*****************************************************************************
122*cdf0e10cSrcweir  * binary search on encoding tables
123*cdf0e10cSrcweir  *****************************************************************************/
124*cdf0e10cSrcweir 
125*cdf0e10cSrcweir static const _pair*
126*cdf0e10cSrcweir _pair_search (const char *key, const _pair *base, unsigned int member )
127*cdf0e10cSrcweir {
128*cdf0e10cSrcweir     unsigned int lower = 0;
129*cdf0e10cSrcweir     unsigned int upper = member;
130*cdf0e10cSrcweir     unsigned int current;
131*cdf0e10cSrcweir     int comparison;
132*cdf0e10cSrcweir 
133*cdf0e10cSrcweir     /* check for validity of input */
134*cdf0e10cSrcweir     if ( (key == NULL) || (base == NULL) || (member == 0) )
135*cdf0e10cSrcweir         return NULL;
136*cdf0e10cSrcweir 
137*cdf0e10cSrcweir     /* binary search */
138*cdf0e10cSrcweir     while ( lower < upper )
139*cdf0e10cSrcweir     {
140*cdf0e10cSrcweir         current = (lower + upper) / 2;
141*cdf0e10cSrcweir         comparison = _pair_compare( key, base + current );
142*cdf0e10cSrcweir         if (comparison < 0)
143*cdf0e10cSrcweir             upper = current;
144*cdf0e10cSrcweir         else
145*cdf0e10cSrcweir         if (comparison > 0)
146*cdf0e10cSrcweir             lower = current + 1;
147*cdf0e10cSrcweir         else
148*cdf0e10cSrcweir             return base + current;
149*cdf0e10cSrcweir     }
150*cdf0e10cSrcweir 
151*cdf0e10cSrcweir     return NULL;
152*cdf0e10cSrcweir }
153*cdf0e10cSrcweir 
154*cdf0e10cSrcweir 
155*cdf0e10cSrcweir /************************************************************************
156*cdf0e10cSrcweir  * read_encoding_table
157*cdf0e10cSrcweir  ************************************************************************/
158*cdf0e10cSrcweir 
159*cdf0e10cSrcweir void read_encoding_table(char * file, EncodingMap& aEncodingMap)
160*cdf0e10cSrcweir {
161*cdf0e10cSrcweir     FILE * fp = fopen(file, "r");
162*cdf0e10cSrcweir     if ( ! fp  ) {
163*cdf0e10cSrcweir         fprintf(stderr, "ulfconv: %s %s\n", file, strerror(errno));
164*cdf0e10cSrcweir         exit(2);
165*cdf0e10cSrcweir     }
166*cdf0e10cSrcweir 
167*cdf0e10cSrcweir     char buffer[512];
168*cdf0e10cSrcweir     while ( NULL != my_fgets(buffer, sizeof(buffer), fp) ) {
169*cdf0e10cSrcweir 
170*cdf0e10cSrcweir         // strip comment lines
171*cdf0e10cSrcweir         if ( buffer[0] == '#' )
172*cdf0e10cSrcweir             continue;
173*cdf0e10cSrcweir 
174*cdf0e10cSrcweir         // find end of language string
175*cdf0e10cSrcweir         char * cp;
176*cdf0e10cSrcweir         for ( cp = buffer; ! isspace(*cp); cp++ )
177*cdf0e10cSrcweir             ;
178*cdf0e10cSrcweir         *cp = '\0';
179*cdf0e10cSrcweir 
180*cdf0e10cSrcweir         // find start of codepage string
181*cdf0e10cSrcweir         for ( ++cp; isspace(*cp); ++cp )
182*cdf0e10cSrcweir             ;
183*cdf0e10cSrcweir         char * codepage = cp;
184*cdf0e10cSrcweir 
185*cdf0e10cSrcweir         // find end of codepage string
186*cdf0e10cSrcweir         for ( ++cp; ! isspace(*cp); ++cp )
187*cdf0e10cSrcweir             ;
188*cdf0e10cSrcweir         *cp = '\0';
189*cdf0e10cSrcweir 
190*cdf0e10cSrcweir         // find the correct mapping for codepage
191*cdf0e10cSrcweir         const unsigned int members = sizeof( _ms_encoding_list ) / sizeof( _pair );
192*cdf0e10cSrcweir         const _pair *encoding = _pair_search( codepage, _ms_encoding_list, members );
193*cdf0e10cSrcweir 
194*cdf0e10cSrcweir         if ( encoding != NULL ) {
195*cdf0e10cSrcweir             const std::string language(buffer);
196*cdf0e10cSrcweir             aEncodingMap.insert( EncodingMap::value_type(language, encoding->value) );
197*cdf0e10cSrcweir         }
198*cdf0e10cSrcweir     }
199*cdf0e10cSrcweir 
200*cdf0e10cSrcweir     fclose(fp);
201*cdf0e10cSrcweir }
202*cdf0e10cSrcweir 
203*cdf0e10cSrcweir /************************************************************************
204*cdf0e10cSrcweir  * print_legacy_mixed
205*cdf0e10cSrcweir  ************************************************************************/
206*cdf0e10cSrcweir 
207*cdf0e10cSrcweir void print_legacy_mixed(
208*cdf0e10cSrcweir     FILE * ostream,
209*cdf0e10cSrcweir     const rtl::OUString& aString,
210*cdf0e10cSrcweir     const std::string& language,
211*cdf0e10cSrcweir     EncodingMap& aEncodingMap)
212*cdf0e10cSrcweir {
213*cdf0e10cSrcweir     EncodingMap::iterator iter = aEncodingMap.find(language);
214*cdf0e10cSrcweir 
215*cdf0e10cSrcweir     if ( iter != aEncodingMap.end() ) {
216*cdf0e10cSrcweir         fputs(OUStringToOString(aString, iter->second).getStr(), ostream);
217*cdf0e10cSrcweir     } else {
218*cdf0e10cSrcweir         fprintf(stderr, "ulfconv: WARNING: no legacy encoding found for %s\n", language.c_str());
219*cdf0e10cSrcweir     }
220*cdf0e10cSrcweir }
221*cdf0e10cSrcweir 
222*cdf0e10cSrcweir /************************************************************************
223*cdf0e10cSrcweir  * print_java_style
224*cdf0e10cSrcweir  ************************************************************************/
225*cdf0e10cSrcweir 
226*cdf0e10cSrcweir void print_java_style(FILE * ostream, const rtl::OUString& aString)
227*cdf0e10cSrcweir {
228*cdf0e10cSrcweir     int imax = aString.getLength();
229*cdf0e10cSrcweir     for (int i = 0; i < imax; i++) {
230*cdf0e10cSrcweir         sal_Unicode uc = aString[i];
231*cdf0e10cSrcweir         if ( uc < 128 ) {
232*cdf0e10cSrcweir             fprintf(ostream, "%c", (char) uc);
233*cdf0e10cSrcweir         } else {
234*cdf0e10cSrcweir             fprintf(ostream, "\\u%2.2x%2.2x", uc >> 8, uc & 0xFF );
235*cdf0e10cSrcweir         }
236*cdf0e10cSrcweir     }
237*cdf0e10cSrcweir }
238*cdf0e10cSrcweir 
239*cdf0e10cSrcweir /************************************************************************
240*cdf0e10cSrcweir  * main
241*cdf0e10cSrcweir  ************************************************************************/
242*cdf0e10cSrcweir 
243*cdf0e10cSrcweir int main( int argc, char * const argv[] )
244*cdf0e10cSrcweir {
245*cdf0e10cSrcweir     EncodingMap aEncodingMap;
246*cdf0e10cSrcweir 
247*cdf0e10cSrcweir     FILE *istream = stdin;
248*cdf0e10cSrcweir     FILE *ostream = stdout;
249*cdf0e10cSrcweir 
250*cdf0e10cSrcweir     char *outfile = NULL;
251*cdf0e10cSrcweir 
252*cdf0e10cSrcweir     int errflg = 0;
253*cdf0e10cSrcweir     int argi;
254*cdf0e10cSrcweir 
255*cdf0e10cSrcweir     for( argi=1; argi < argc; argi++ )
256*cdf0e10cSrcweir     {
257*cdf0e10cSrcweir         if( argv[argi][0] == '-' && argv[argi][2] == '\0' )
258*cdf0e10cSrcweir         {
259*cdf0e10cSrcweir             switch(argv[argi][1]) {
260*cdf0e10cSrcweir             case 'o':
261*cdf0e10cSrcweir                 if (argi+1 >= argc || argv[argi+1][0] == '-')
262*cdf0e10cSrcweir                 {
263*cdf0e10cSrcweir                     fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
264*cdf0e10cSrcweir                     errflg++;
265*cdf0e10cSrcweir                     break;
266*cdf0e10cSrcweir                 }
267*cdf0e10cSrcweir 
268*cdf0e10cSrcweir                 ++argi;
269*cdf0e10cSrcweir                 outfile = argv[argi];
270*cdf0e10cSrcweir                 break;
271*cdf0e10cSrcweir             case 't':
272*cdf0e10cSrcweir                 if (argi+1 >= argc || argv[argi+1][0] == '-')
273*cdf0e10cSrcweir                 {
274*cdf0e10cSrcweir                     fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
275*cdf0e10cSrcweir                     errflg++;
276*cdf0e10cSrcweir                     break;
277*cdf0e10cSrcweir                 }
278*cdf0e10cSrcweir 
279*cdf0e10cSrcweir                 read_encoding_table(argv[++argi], aEncodingMap);
280*cdf0e10cSrcweir                 break;
281*cdf0e10cSrcweir             default:
282*cdf0e10cSrcweir                 fprintf(stderr, "Unrecognized option: -%c\n", argv[argi][1]);
283*cdf0e10cSrcweir                 errflg++;
284*cdf0e10cSrcweir             }
285*cdf0e10cSrcweir         }
286*cdf0e10cSrcweir         else
287*cdf0e10cSrcweir         {
288*cdf0e10cSrcweir             break;
289*cdf0e10cSrcweir         }
290*cdf0e10cSrcweir     }
291*cdf0e10cSrcweir 
292*cdf0e10cSrcweir     if (errflg) {
293*cdf0e10cSrcweir       fprintf(stderr, "Usage: ulfconv [-o <output file>] [-t <encoding table>] [<ulf file>]\n");
294*cdf0e10cSrcweir       exit(2);
295*cdf0e10cSrcweir     }
296*cdf0e10cSrcweir 
297*cdf0e10cSrcweir     /* assign input file to stdin */
298*cdf0e10cSrcweir     if ( argi < argc )
299*cdf0e10cSrcweir     {
300*cdf0e10cSrcweir         istream = fopen(argv[argi], "r");
301*cdf0e10cSrcweir         if ( istream  == NULL ) {
302*cdf0e10cSrcweir             fprintf(stderr, "ulfconv: %s : %s\n", argv[argi], strerror(errno));
303*cdf0e10cSrcweir             exit(2);
304*cdf0e10cSrcweir         }
305*cdf0e10cSrcweir     }
306*cdf0e10cSrcweir 
307*cdf0e10cSrcweir 	/* open output file if any */
308*cdf0e10cSrcweir 	if ( outfile )
309*cdf0e10cSrcweir 	{
310*cdf0e10cSrcweir         ostream = fopen(outfile, "w");
311*cdf0e10cSrcweir         if ( ostream == NULL ) {
312*cdf0e10cSrcweir             fprintf(stderr, "ulfconv: %s : %s\n", outfile, strerror(errno));
313*cdf0e10cSrcweir             fclose(istream);
314*cdf0e10cSrcweir             exit(2);
315*cdf0e10cSrcweir         }
316*cdf0e10cSrcweir 	}
317*cdf0e10cSrcweir 
318*cdf0e10cSrcweir     /* read line by line from stdin */
319*cdf0e10cSrcweir     char buffer[65536];
320*cdf0e10cSrcweir     while ( NULL != fgets(buffer, sizeof(buffer), istream) ) {
321*cdf0e10cSrcweir 
322*cdf0e10cSrcweir         /* only handle lines containing " = " */
323*cdf0e10cSrcweir         char * cp = strstr(buffer, " = \"");
324*cdf0e10cSrcweir         if ( cp ) {
325*cdf0e10cSrcweir             rtl::OUString aString;
326*cdf0e10cSrcweir 
327*cdf0e10cSrcweir             /* find end of lang string */
328*cdf0e10cSrcweir             int n;
329*cdf0e10cSrcweir             for ( n=0; ! isspace(buffer[n]); n++ )
330*cdf0e10cSrcweir                 ;
331*cdf0e10cSrcweir 
332*cdf0e10cSrcweir             std::string line = buffer;
333*cdf0e10cSrcweir             std::string lang(line, 0, n);
334*cdf0e10cSrcweir 
335*cdf0e10cSrcweir             cp += 4;
336*cdf0e10cSrcweir             rtl_string2UString( &aString.pData, cp, strrchr(cp, '\"') - cp,
337*cdf0e10cSrcweir                 RTL_TEXTENCODING_UTF8, OSTRING_TO_OUSTRING_CVTFLAGS );
338*cdf0e10cSrcweir 
339*cdf0e10cSrcweir             fprintf(ostream, "%s = \"", lang.c_str());
340*cdf0e10cSrcweir 
341*cdf0e10cSrcweir             if ( aEncodingMap.empty() ) {
342*cdf0e10cSrcweir                 print_java_style(ostream, aString);
343*cdf0e10cSrcweir             } else {
344*cdf0e10cSrcweir                 print_legacy_mixed(ostream, aString, lang, aEncodingMap);
345*cdf0e10cSrcweir             }
346*cdf0e10cSrcweir 
347*cdf0e10cSrcweir             fprintf(ostream, "\"\n");
348*cdf0e10cSrcweir 
349*cdf0e10cSrcweir 
350*cdf0e10cSrcweir         } else {
351*cdf0e10cSrcweir             fputs(buffer, ostream);
352*cdf0e10cSrcweir         }
353*cdf0e10cSrcweir     }
354*cdf0e10cSrcweir 
355*cdf0e10cSrcweir     fclose(ostream);
356*cdf0e10cSrcweir     fclose(istream);
357*cdf0e10cSrcweir }
358