xref: /AOO41X/main/ucb/source/regexp/regexp.cxx (revision 2f86921c33504fdff5a030df6c0b258927045abb) !
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_ucb.hxx"
26 #include <regexp.hxx>
27 
28 #include <cstddef>
29 
30 #include "osl/diagnose.h"
31 #include <com/sun/star/lang/IllegalArgumentException.hpp>
32 #include <rtl/ustrbuf.hxx>
33 #include <rtl/ustring.hxx>
34 
35 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp;
36     // unnamed namespaces don't work well yet...
37 
38 using namespace com::sun::star;
39 using namespace ucb_impl;
40 
41 //============================================================================
42 //
43 //  Regexp
44 //
45 //============================================================================
46 
Regexp(Kind eTheKind,rtl::OUString const & rThePrefix,bool bTheEmptyDomain,rtl::OUString const & rTheInfix,bool bTheTranslation,rtl::OUString const & rTheReversePrefix)47 inline Regexp::Regexp(Kind eTheKind, rtl::OUString const & rThePrefix,
48                       bool bTheEmptyDomain, rtl::OUString const & rTheInfix,
49                       bool bTheTranslation,
50                       rtl::OUString const & rTheReversePrefix):
51     m_eKind(eTheKind),
52     m_aPrefix(rThePrefix),
53     m_aInfix(rTheInfix),
54     m_aReversePrefix(rTheReversePrefix),
55     m_bEmptyDomain(bTheEmptyDomain),
56     m_bTranslation(bTheTranslation)
57 {
58     OSL_ASSERT(m_eKind == KIND_DOMAIN
59                || !m_bEmptyDomain && m_aInfix.getLength() == 0);
60     OSL_ASSERT(m_bTranslation || m_aReversePrefix.getLength() == 0);
61 }
62 
63 //============================================================================
64 namespace unnamed_ucb_regexp {
65 
matchStringIgnoreCase(sal_Unicode const ** pBegin,sal_Unicode const * pEnd,rtl::OUString const & rString)66 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
67                            sal_Unicode const * pEnd,
68                            rtl::OUString const & rString)
69 {
70     sal_Unicode const * p = *pBegin;
71 
72     sal_Unicode const * q = rString.getStr();
73     sal_Unicode const * qEnd = q + rString.getLength();
74 
75     if (pEnd - p < qEnd - q)
76         return false;
77 
78     while (q != qEnd)
79     {
80         sal_Unicode c1 = *p++;
81         sal_Unicode c2 = *q++;
82         if (c1 >= 'a' && c1 <= 'z')
83             c1 -= 'a' - 'A';
84         if (c2 >= 'a' && c2 <= 'z')
85             c2 -= 'a' - 'A';
86         if (c1 != c2)
87             return false;
88     }
89 
90     *pBegin = p;
91     return true;
92 }
93 
94 }
95 
matches(rtl::OUString const & rString,rtl::OUString * pTranslation,bool * pTranslated) const96 bool Regexp::matches(rtl::OUString const & rString,
97                      rtl::OUString * pTranslation, bool * pTranslated) const
98 {
99     sal_Unicode const * pBegin = rString.getStr();
100     sal_Unicode const * pEnd = pBegin + rString.getLength();
101 
102     bool bMatches = false;
103 
104     sal_Unicode const * p = pBegin;
105     if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
106     {
107         sal_Unicode const * pBlock1Begin = p;
108         sal_Unicode const * pBlock1End = pEnd;
109 
110         sal_Unicode const * pBlock2Begin = 0;
111         sal_Unicode const * pBlock2End = 0;
112 
113         switch (m_eKind)
114         {
115             case KIND_PREFIX:
116                 bMatches = true;
117                 break;
118 
119             case KIND_AUTHORITY:
120                 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
121                 break;
122 
123             case KIND_DOMAIN:
124                 if (!m_bEmptyDomain)
125                 {
126                     if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
127                         break;
128                     ++p;
129                 }
130                 for (;;)
131                 {
132                     sal_Unicode const * q = p;
133                     if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
134                         && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
135                     {
136                         bMatches = true;
137                         pBlock1End = p;
138                         pBlock2Begin = q;
139                         pBlock2End = pEnd;
140                         break;
141                     }
142 
143                     if (p == pEnd)
144                         break;
145 
146                     sal_Unicode c = *p++;
147                     if (c == '/' || c == '?' || c == '#')
148                         break;
149                 }
150                 break;
151         }
152 
153         if (bMatches)
154         {
155             if (m_bTranslation)
156             {
157                 if (pTranslation)
158                 {
159                     rtl::OUStringBuffer aBuffer(m_aReversePrefix);
160                     aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin);
161                     aBuffer.append(m_aInfix);
162                     aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin);
163                     *pTranslation = aBuffer.makeStringAndClear();
164                 }
165                 if (pTranslated)
166                     *pTranslated = true;
167             }
168             else
169             {
170                 if (pTranslation)
171                     *pTranslation = rString;
172                 if (pTranslated)
173                     *pTranslated = false;
174             }
175         }
176     }
177 
178     return bMatches;
179 }
180 
181 //============================================================================
182 namespace unnamed_ucb_regexp {
183 
isAlpha(sal_Unicode c)184 inline bool isAlpha(sal_Unicode c)
185 {
186     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
187 }
188 
isDigit(sal_Unicode c)189 inline bool isDigit(sal_Unicode c)
190 {
191     return c >= '0' && c <= '9';
192 }
193 
isScheme(rtl::OUString const & rString,bool bColon)194 bool isScheme(rtl::OUString const & rString, bool bColon)
195 {
196     // Return true if rString matches <scheme> (plus a trailing ":" if bColon
197     // is true) from RFC 2396:
198     sal_Unicode const * p = rString.getStr();
199     sal_Unicode const * pEnd = p + rString.getLength();
200     if (p != pEnd && isAlpha(*p))
201         for (++p;;)
202         {
203             if (p == pEnd)
204                 return !bColon;
205             sal_Unicode c = *p++;
206             if (!(isAlpha(c) || isDigit(c)
207                   || c == '+' || c == '-' || c == '.'))
208                 return bColon && c == ':' && p == pEnd;
209         }
210     return false;
211 }
212 
appendStringLiteral(rtl::OUStringBuffer * pBuffer,rtl::OUString const & rString)213 void appendStringLiteral(rtl::OUStringBuffer * pBuffer,
214                          rtl::OUString const & rString)
215 {
216     OSL_ASSERT(pBuffer);
217 
218     pBuffer->append(sal_Unicode('"'));
219     sal_Unicode const * p = rString.getStr();
220     sal_Unicode const * pEnd = p + rString.getLength();
221     while (p != pEnd)
222     {
223         sal_Unicode c = *p++;
224         if (c == '"' || c == '\\')
225             pBuffer->append(sal_Unicode('\\'));
226         pBuffer->append(c);
227     }
228     pBuffer->append(sal_Unicode('"'));
229 }
230 
231 }
232 
getRegexp(bool bReverse) const233 rtl::OUString Regexp::getRegexp(bool bReverse) const
234 {
235     if (m_bTranslation)
236     {
237         rtl::OUStringBuffer aBuffer;
238         if (bReverse)
239         {
240             if (m_aReversePrefix.getLength() != 0)
241                 appendStringLiteral(&aBuffer, m_aReversePrefix);
242         }
243         else
244         {
245             if (m_aPrefix.getLength() != 0)
246                 appendStringLiteral(&aBuffer, m_aPrefix);
247         }
248         switch (m_eKind)
249         {
250             case KIND_PREFIX:
251                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("(.*)"));
252                 break;
253 
254             case KIND_AUTHORITY:
255                 aBuffer.
256                     appendAscii(RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)"));
257                 break;
258 
259             case KIND_DOMAIN:
260                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([^/?#]"));
261                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
262                 if (m_aInfix.getLength() != 0)
263                     appendStringLiteral(&aBuffer, m_aInfix);
264                 aBuffer.
265                     appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?)"));
266                 break;
267         }
268         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("->"));
269         if (bReverse)
270         {
271             if (m_aPrefix.getLength() != 0)
272                 appendStringLiteral(&aBuffer, m_aPrefix);
273         }
274         else
275         {
276             if (m_aReversePrefix.getLength() != 0)
277                 appendStringLiteral(&aBuffer, m_aReversePrefix);
278         }
279         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("\\1"));
280         return aBuffer.makeStringAndClear();
281     }
282     else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
283         return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
284     else
285     {
286         rtl::OUStringBuffer aBuffer;
287         if (m_aPrefix.getLength() != 0)
288             appendStringLiteral(&aBuffer, m_aPrefix);
289         switch (m_eKind)
290         {
291             case KIND_PREFIX:
292                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM(".*"));
293                 break;
294 
295             case KIND_AUTHORITY:
296                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
297                 break;
298 
299             case KIND_DOMAIN:
300                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("[^/?#]"));
301                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
302                 if (m_aInfix.getLength() != 0)
303                     appendStringLiteral(&aBuffer, m_aInfix);
304                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
305                 break;
306         }
307         return aBuffer.makeStringAndClear();
308     }
309 }
310 
311 //============================================================================
312 namespace unnamed_ucb_regexp {
313 
matchString(sal_Unicode const ** pBegin,sal_Unicode const * pEnd,sal_Char const * pString,size_t nStringLength)314 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
315                  sal_Char const * pString, size_t nStringLength)
316 {
317     sal_Unicode const * p = *pBegin;
318 
319     sal_uChar const * q = reinterpret_cast< sal_uChar const * >(pString);
320     sal_uChar const * qEnd = q + nStringLength;
321 
322     if (pEnd - p < qEnd - q)
323         return false;
324 
325     while (q != qEnd)
326     {
327         sal_Unicode c1 = *p++;
328         sal_Unicode c2 = *q++;
329         if (c1 != c2)
330             return false;
331     }
332 
333     *pBegin = p;
334     return true;
335 }
336 
scanStringLiteral(sal_Unicode const ** pBegin,sal_Unicode const * pEnd,rtl::OUString * pString)337 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
338                        rtl::OUString * pString)
339 {
340     sal_Unicode const * p = *pBegin;
341 
342     if (p == pEnd || *p++ != '"')
343         return false;
344 
345     rtl::OUStringBuffer aBuffer;
346     for (;;)
347     {
348         if (p == pEnd)
349             return false;
350         sal_Unicode c = *p++;
351         if (c == '"')
352             break;
353         if (c == '\\')
354         {
355             if (p == pEnd)
356                 return false;
357             c = *p++;
358             if (c != '"' && c != '\\')
359                 return false;
360         }
361         aBuffer.append(c);
362     }
363 
364     *pBegin = p;
365     *pString = aBuffer.makeStringAndClear();
366     return true;
367 }
368 
369 }
370 
parse(rtl::OUString const & rRegexp)371 Regexp Regexp::parse(rtl::OUString const & rRegexp)
372 {
373     // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
374     // where <scheme> is as defined in RFC 2396:
375     if (isScheme(rRegexp, false))
376         return Regexp(Regexp::KIND_PREFIX,
377                       rRegexp
378                           + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(":")),
379                       false,
380                       rtl::OUString(),
381                       false,
382                       rtl::OUString());
383 
384     sal_Unicode const * p = rRegexp.getStr();
385     sal_Unicode const * pEnd = p + rRegexp.getLength();
386 
387     rtl::OUString aPrefix;
388     scanStringLiteral(&p, pEnd, &aPrefix);
389 
390     if (p == pEnd)
391         throw lang::IllegalArgumentException();
392 
393     if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
394     {
395         if (p != pEnd)
396             throw lang::IllegalArgumentException();
397 
398         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
399                       false, rtl::OUString());
400     }
401     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
402     {
403         rtl::OUString aReversePrefix;
404         scanStringLiteral(&p, pEnd, &aReversePrefix);
405 
406         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
407             || p != pEnd)
408             throw lang::IllegalArgumentException();
409 
410         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
411                       true, aReversePrefix);
412     }
413     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
414     {
415         if (p != pEnd)
416             throw lang::IllegalArgumentException();
417 
418         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
419                       false, rtl::OUString());
420     }
421     else if (matchString(&p, pEnd,
422                          RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
423     {
424         rtl::OUString aReversePrefix;
425         if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
426               && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
427               && p == pEnd))
428             throw lang::IllegalArgumentException();
429 
430         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
431                       true, aReversePrefix);
432     }
433     else
434     {
435         bool bOpen = false;
436         if (p != pEnd && *p == '(')
437         {
438             ++p;
439             bOpen = true;
440         }
441 
442         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
443             throw lang::IllegalArgumentException();
444 
445         if (p == pEnd || (*p != '*' && *p != '+'))
446             throw lang::IllegalArgumentException();
447         bool bEmptyDomain = *p++ == '*';
448 
449         rtl::OUString aInfix;
450         scanStringLiteral(&p, pEnd, &aInfix);
451 
452         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
453             throw lang::IllegalArgumentException();
454 
455         rtl::OUString aReversePrefix;
456         if (bOpen
457             && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
458                  && scanStringLiteral(&p, pEnd, &aReversePrefix)
459                  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
460             throw lang::IllegalArgumentException();
461 
462         if (p != pEnd)
463             throw lang::IllegalArgumentException();
464 
465         return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
466                       bOpen, aReversePrefix);
467     }
468 }
469 
470