// xref: /AOO41X/main/setup_native/source/ulfconv/ulfconv.cxx (revision 32b1fd08cf0851da51c0ed68f50bc63c4ee660e0)
/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/



#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <sal/alloca.h>

#include <rtl/ustring.hxx>

#include <map>
#include <string>

38cdf0e10cSrcweir /*****************************************************************************
39cdf0e10cSrcweir  * typedefs
40cdf0e10cSrcweir  *****************************************************************************/
41cdf0e10cSrcweir 
42cdf0e10cSrcweir typedef std::map< const std::string, rtl_TextEncoding > EncodingMap;
43cdf0e10cSrcweir 
44cdf0e10cSrcweir struct _pair {
45cdf0e10cSrcweir     const char *key;
46cdf0e10cSrcweir     rtl_TextEncoding value;
47cdf0e10cSrcweir };
48cdf0e10cSrcweir 
49cdf0e10cSrcweir static int _pair_compare (const char *key, const _pair *pair);
50cdf0e10cSrcweir static const _pair* _pair_search (const char *key, const _pair *base, unsigned int member );
51cdf0e10cSrcweir 
52cdf0e10cSrcweir 
53cdf0e10cSrcweir const _pair _ms_encoding_list[] = {
54cdf0e10cSrcweir     { "0",       RTL_TEXTENCODING_UTF8        },
55cdf0e10cSrcweir     { "1250",    RTL_TEXTENCODING_MS_1250     },
56cdf0e10cSrcweir     { "1251",    RTL_TEXTENCODING_MS_1251     },
57cdf0e10cSrcweir     { "1252",    RTL_TEXTENCODING_MS_1252     },
58cdf0e10cSrcweir     { "1253",    RTL_TEXTENCODING_MS_1253     },
59cdf0e10cSrcweir     { "1254",    RTL_TEXTENCODING_MS_1254     },
60cdf0e10cSrcweir     { "1255",    RTL_TEXTENCODING_MS_1255     },
61cdf0e10cSrcweir     { "1256",    RTL_TEXTENCODING_MS_1256     },
62cdf0e10cSrcweir     { "1257",    RTL_TEXTENCODING_MS_1257     },
63cdf0e10cSrcweir     { "1258",    RTL_TEXTENCODING_MS_1258     },
64cdf0e10cSrcweir     { "874",     RTL_TEXTENCODING_MS_874      },
65cdf0e10cSrcweir     { "932",     RTL_TEXTENCODING_MS_932      },
66cdf0e10cSrcweir     { "936",     RTL_TEXTENCODING_MS_936      },
67cdf0e10cSrcweir     { "949",     RTL_TEXTENCODING_MS_949      },
68cdf0e10cSrcweir     { "950",     RTL_TEXTENCODING_MS_950      }
69cdf0e10cSrcweir };
70cdf0e10cSrcweir 
71cdf0e10cSrcweir 
/*****************************************************************************
 * fgets that works with unix line ends on Windows
 *****************************************************************************/

/**
 * Reads characters from fp into s with getc() until n-1 characters have
 * been stored, a '\n' has been stored, or EOF is hit, then NUL-terminates s.
 *
 * @param s   destination buffer with room for at least n characters
 * @param n   size of the destination buffer
 * @param fp  stream to read from
 * @return s if at least one character was stored; NULL otherwise (EOF
 *         before any character, n <= 1, or a NULL argument). s is left
 *         untouched when NULL is returned.
 */
char * my_fgets(char *s, int n, FILE *fp)
{
    /* nothing can be stored: reject before touching the buffer or stream */
    if( s == NULL || fp == NULL || n <= 1 )
        return NULL;

    int count = 0;
    while( count < n - 1 )
    {
        const int c = getc(fp);
        if( c == EOF )
            break;

        s[count++] = static_cast<char>(c);

        /* the newline itself is kept, like fgets() does */
        if( c == '\n' )
            break;
    }

    if( count == 0 )
        return NULL;

    s[count] = '\0';
    return s;
}

106cdf0e10cSrcweir /*****************************************************************************
107cdf0e10cSrcweir  * compare function for binary search
108cdf0e10cSrcweir  *****************************************************************************/
109cdf0e10cSrcweir 
110cdf0e10cSrcweir static int
_pair_compare(const char * key,const _pair * pair)111cdf0e10cSrcweir _pair_compare (const char *key, const _pair *pair)
112cdf0e10cSrcweir {
113cdf0e10cSrcweir     int result = rtl_str_compareIgnoreAsciiCase( key, pair->key );
114cdf0e10cSrcweir     return result;
115cdf0e10cSrcweir }
116cdf0e10cSrcweir 
117cdf0e10cSrcweir /*****************************************************************************
118cdf0e10cSrcweir  * binary search on encoding tables
119cdf0e10cSrcweir  *****************************************************************************/
120cdf0e10cSrcweir 
121cdf0e10cSrcweir static const _pair*
_pair_search(const char * key,const _pair * base,unsigned int member)122cdf0e10cSrcweir _pair_search (const char *key, const _pair *base, unsigned int member )
123cdf0e10cSrcweir {
124cdf0e10cSrcweir     unsigned int lower = 0;
125cdf0e10cSrcweir     unsigned int upper = member;
126cdf0e10cSrcweir     unsigned int current;
127cdf0e10cSrcweir     int comparison;
128cdf0e10cSrcweir 
129cdf0e10cSrcweir     /* check for validity of input */
130cdf0e10cSrcweir     if ( (key == NULL) || (base == NULL) || (member == 0) )
131cdf0e10cSrcweir         return NULL;
132cdf0e10cSrcweir 
133cdf0e10cSrcweir     /* binary search */
134cdf0e10cSrcweir     while ( lower < upper )
135cdf0e10cSrcweir     {
136cdf0e10cSrcweir         current = (lower + upper) / 2;
137cdf0e10cSrcweir         comparison = _pair_compare( key, base + current );
138cdf0e10cSrcweir         if (comparison < 0)
139cdf0e10cSrcweir             upper = current;
140cdf0e10cSrcweir         else
141cdf0e10cSrcweir         if (comparison > 0)
142cdf0e10cSrcweir             lower = current + 1;
143cdf0e10cSrcweir         else
144cdf0e10cSrcweir             return base + current;
145cdf0e10cSrcweir     }
146cdf0e10cSrcweir 
147cdf0e10cSrcweir     return NULL;
148cdf0e10cSrcweir }
149cdf0e10cSrcweir 
150cdf0e10cSrcweir 
151cdf0e10cSrcweir /************************************************************************
152cdf0e10cSrcweir  * read_encoding_table
153cdf0e10cSrcweir  ************************************************************************/
154cdf0e10cSrcweir 
read_encoding_table(char * file,EncodingMap & aEncodingMap)155cdf0e10cSrcweir void read_encoding_table(char * file, EncodingMap& aEncodingMap)
156cdf0e10cSrcweir {
157cdf0e10cSrcweir     FILE * fp = fopen(file, "r");
158cdf0e10cSrcweir     if ( ! fp  ) {
159cdf0e10cSrcweir         fprintf(stderr, "ulfconv: %s %s\n", file, strerror(errno));
160cdf0e10cSrcweir         exit(2);
161cdf0e10cSrcweir     }
162cdf0e10cSrcweir 
163cdf0e10cSrcweir     char buffer[512];
164cdf0e10cSrcweir     while ( NULL != my_fgets(buffer, sizeof(buffer), fp) ) {
165cdf0e10cSrcweir 
166cdf0e10cSrcweir         // strip comment lines
167cdf0e10cSrcweir         if ( buffer[0] == '#' )
168cdf0e10cSrcweir             continue;
169cdf0e10cSrcweir 
170cdf0e10cSrcweir         // find end of language string
171cdf0e10cSrcweir         char * cp;
172cdf0e10cSrcweir         for ( cp = buffer; ! isspace(*cp); cp++ )
173cdf0e10cSrcweir             ;
174cdf0e10cSrcweir         *cp = '\0';
175cdf0e10cSrcweir 
176cdf0e10cSrcweir         // find start of codepage string
177cdf0e10cSrcweir         for ( ++cp; isspace(*cp); ++cp )
178cdf0e10cSrcweir             ;
179cdf0e10cSrcweir         char * codepage = cp;
180cdf0e10cSrcweir 
181cdf0e10cSrcweir         // find end of codepage string
182cdf0e10cSrcweir         for ( ++cp; ! isspace(*cp); ++cp )
183cdf0e10cSrcweir             ;
184cdf0e10cSrcweir         *cp = '\0';
185cdf0e10cSrcweir 
186cdf0e10cSrcweir         // find the correct mapping for codepage
187cdf0e10cSrcweir         const unsigned int members = sizeof( _ms_encoding_list ) / sizeof( _pair );
188cdf0e10cSrcweir         const _pair *encoding = _pair_search( codepage, _ms_encoding_list, members );
189cdf0e10cSrcweir 
190cdf0e10cSrcweir         if ( encoding != NULL ) {
191cdf0e10cSrcweir             const std::string language(buffer);
192cdf0e10cSrcweir             aEncodingMap.insert( EncodingMap::value_type(language, encoding->value) );
193cdf0e10cSrcweir         }
194cdf0e10cSrcweir     }
195cdf0e10cSrcweir 
196cdf0e10cSrcweir     fclose(fp);
197cdf0e10cSrcweir }
198cdf0e10cSrcweir 
199cdf0e10cSrcweir /************************************************************************
200cdf0e10cSrcweir  * print_legacy_mixed
201cdf0e10cSrcweir  ************************************************************************/
202cdf0e10cSrcweir 
print_legacy_mixed(FILE * ostream,const rtl::OUString & aString,const std::string & language,EncodingMap & aEncodingMap)203cdf0e10cSrcweir void print_legacy_mixed(
204cdf0e10cSrcweir     FILE * ostream,
205cdf0e10cSrcweir     const rtl::OUString& aString,
206cdf0e10cSrcweir     const std::string& language,
207cdf0e10cSrcweir     EncodingMap& aEncodingMap)
208cdf0e10cSrcweir {
209cdf0e10cSrcweir     EncodingMap::iterator iter = aEncodingMap.find(language);
210cdf0e10cSrcweir 
211cdf0e10cSrcweir     if ( iter != aEncodingMap.end() ) {
212cdf0e10cSrcweir         fputs(OUStringToOString(aString, iter->second).getStr(), ostream);
213cdf0e10cSrcweir     } else {
214cdf0e10cSrcweir         fprintf(stderr, "ulfconv: WARNING: no legacy encoding found for %s\n", language.c_str());
215cdf0e10cSrcweir     }
216cdf0e10cSrcweir }
217cdf0e10cSrcweir 
218cdf0e10cSrcweir /************************************************************************
219cdf0e10cSrcweir  * print_java_style
220cdf0e10cSrcweir  ************************************************************************/
221cdf0e10cSrcweir 
print_java_style(FILE * ostream,const rtl::OUString & aString)222cdf0e10cSrcweir void print_java_style(FILE * ostream, const rtl::OUString& aString)
223cdf0e10cSrcweir {
224cdf0e10cSrcweir     int imax = aString.getLength();
225cdf0e10cSrcweir     for (int i = 0; i < imax; i++) {
226cdf0e10cSrcweir         sal_Unicode uc = aString[i];
227cdf0e10cSrcweir         if ( uc < 128 ) {
228cdf0e10cSrcweir             fprintf(ostream, "%c", (char) uc);
229cdf0e10cSrcweir         } else {
230cdf0e10cSrcweir             fprintf(ostream, "\\u%2.2x%2.2x", uc >> 8, uc & 0xFF );
231cdf0e10cSrcweir         }
232cdf0e10cSrcweir     }
233cdf0e10cSrcweir }
234cdf0e10cSrcweir 
235cdf0e10cSrcweir /************************************************************************
236cdf0e10cSrcweir  * main
237cdf0e10cSrcweir  ************************************************************************/
238cdf0e10cSrcweir 
main(int argc,char * const argv[])239cdf0e10cSrcweir int main( int argc, char * const argv[] )
240cdf0e10cSrcweir {
241cdf0e10cSrcweir     EncodingMap aEncodingMap;
242cdf0e10cSrcweir 
243cdf0e10cSrcweir     FILE *istream = stdin;
244cdf0e10cSrcweir     FILE *ostream = stdout;
245cdf0e10cSrcweir 
246cdf0e10cSrcweir     char *outfile = NULL;
247cdf0e10cSrcweir 
248cdf0e10cSrcweir     int errflg = 0;
249cdf0e10cSrcweir     int argi;
250cdf0e10cSrcweir 
251cdf0e10cSrcweir     for( argi=1; argi < argc; argi++ )
252cdf0e10cSrcweir     {
253cdf0e10cSrcweir         if( argv[argi][0] == '-' && argv[argi][2] == '\0' )
254cdf0e10cSrcweir         {
255cdf0e10cSrcweir             switch(argv[argi][1]) {
256cdf0e10cSrcweir             case 'o':
257cdf0e10cSrcweir                 if (argi+1 >= argc || argv[argi+1][0] == '-')
258cdf0e10cSrcweir                 {
259cdf0e10cSrcweir                     fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
260cdf0e10cSrcweir                     errflg++;
261cdf0e10cSrcweir                     break;
262cdf0e10cSrcweir                 }
263cdf0e10cSrcweir 
264cdf0e10cSrcweir                 ++argi;
265cdf0e10cSrcweir                 outfile = argv[argi];
266cdf0e10cSrcweir                 break;
267cdf0e10cSrcweir             case 't':
268cdf0e10cSrcweir                 if (argi+1 >= argc || argv[argi+1][0] == '-')
269cdf0e10cSrcweir                 {
270cdf0e10cSrcweir                     fprintf(stderr, "Option -%c requires an operand\n", argv[argi][1]);
271cdf0e10cSrcweir                     errflg++;
272cdf0e10cSrcweir                     break;
273cdf0e10cSrcweir                 }
274cdf0e10cSrcweir 
275cdf0e10cSrcweir                 read_encoding_table(argv[++argi], aEncodingMap);
276cdf0e10cSrcweir                 break;
277cdf0e10cSrcweir             default:
278cdf0e10cSrcweir                 fprintf(stderr, "Unrecognized option: -%c\n", argv[argi][1]);
279cdf0e10cSrcweir                 errflg++;
280cdf0e10cSrcweir             }
281cdf0e10cSrcweir         }
282cdf0e10cSrcweir         else
283cdf0e10cSrcweir         {
284cdf0e10cSrcweir             break;
285cdf0e10cSrcweir         }
286cdf0e10cSrcweir     }
287cdf0e10cSrcweir 
288cdf0e10cSrcweir     if (errflg) {
289cdf0e10cSrcweir       fprintf(stderr, "Usage: ulfconv [-o <output file>] [-t <encoding table>] [<ulf file>]\n");
290cdf0e10cSrcweir       exit(2);
291cdf0e10cSrcweir     }
292cdf0e10cSrcweir 
293cdf0e10cSrcweir     /* assign input file to stdin */
294cdf0e10cSrcweir     if ( argi < argc )
295cdf0e10cSrcweir     {
296cdf0e10cSrcweir         istream = fopen(argv[argi], "r");
297cdf0e10cSrcweir         if ( istream  == NULL ) {
298cdf0e10cSrcweir             fprintf(stderr, "ulfconv: %s : %s\n", argv[argi], strerror(errno));
299cdf0e10cSrcweir             exit(2);
300cdf0e10cSrcweir         }
301cdf0e10cSrcweir     }
302cdf0e10cSrcweir 
303cdf0e10cSrcweir 	/* open output file if any */
304cdf0e10cSrcweir 	if ( outfile )
305cdf0e10cSrcweir 	{
306cdf0e10cSrcweir         ostream = fopen(outfile, "w");
307cdf0e10cSrcweir         if ( ostream == NULL ) {
308cdf0e10cSrcweir             fprintf(stderr, "ulfconv: %s : %s\n", outfile, strerror(errno));
309cdf0e10cSrcweir             fclose(istream);
310cdf0e10cSrcweir             exit(2);
311cdf0e10cSrcweir         }
312cdf0e10cSrcweir 	}
313cdf0e10cSrcweir 
314cdf0e10cSrcweir     /* read line by line from stdin */
315cdf0e10cSrcweir     char buffer[65536];
316cdf0e10cSrcweir     while ( NULL != fgets(buffer, sizeof(buffer), istream) ) {
317cdf0e10cSrcweir 
318cdf0e10cSrcweir         /* only handle lines containing " = " */
319cdf0e10cSrcweir         char * cp = strstr(buffer, " = \"");
320cdf0e10cSrcweir         if ( cp ) {
321cdf0e10cSrcweir             rtl::OUString aString;
322cdf0e10cSrcweir 
323cdf0e10cSrcweir             /* find end of lang string */
324cdf0e10cSrcweir             int n;
325cdf0e10cSrcweir             for ( n=0; ! isspace(buffer[n]); n++ )
326cdf0e10cSrcweir                 ;
327cdf0e10cSrcweir 
328cdf0e10cSrcweir             std::string line = buffer;
329cdf0e10cSrcweir             std::string lang(line, 0, n);
330cdf0e10cSrcweir 
331cdf0e10cSrcweir             cp += 4;
332cdf0e10cSrcweir             rtl_string2UString( &aString.pData, cp, strrchr(cp, '\"') - cp,
333cdf0e10cSrcweir                 RTL_TEXTENCODING_UTF8, OSTRING_TO_OUSTRING_CVTFLAGS );
334cdf0e10cSrcweir 
335cdf0e10cSrcweir             fprintf(ostream, "%s = \"", lang.c_str());
336cdf0e10cSrcweir 
337cdf0e10cSrcweir             if ( aEncodingMap.empty() ) {
338cdf0e10cSrcweir                 print_java_style(ostream, aString);
339cdf0e10cSrcweir             } else {
340cdf0e10cSrcweir                 print_legacy_mixed(ostream, aString, lang, aEncodingMap);
341cdf0e10cSrcweir             }
342cdf0e10cSrcweir 
343cdf0e10cSrcweir             fprintf(ostream, "\"\n");
344cdf0e10cSrcweir 
345cdf0e10cSrcweir 
346cdf0e10cSrcweir         } else {
347cdf0e10cSrcweir             fputs(buffer, ostream);
348cdf0e10cSrcweir         }
349cdf0e10cSrcweir     }
350cdf0e10cSrcweir 
351cdf0e10cSrcweir     fclose(ostream);
352cdf0e10cSrcweir     fclose(istream);
353cdf0e10cSrcweir }
354