xref: /AOO41X/main/sdext/source/pdfimport/pdfparse/pdfparse.cxx (revision c142477ce2bdb32de904d9995ebf0add7aef9609)
1*c142477cSAndrew Rist /**************************************************************
2cdf0e10cSrcweir  *
3*c142477cSAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4*c142477cSAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5*c142477cSAndrew Rist  * distributed with this work for additional information
6*c142477cSAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7*c142477cSAndrew Rist  * to you under the Apache License, Version 2.0 (the
8*c142477cSAndrew Rist  * "License"); you may not use this file except in compliance
9*c142477cSAndrew Rist  * with the License.  You may obtain a copy of the License at
10cdf0e10cSrcweir  *
11*c142477cSAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12cdf0e10cSrcweir  *
13*c142477cSAndrew Rist  * Unless required by applicable law or agreed to in writing,
14*c142477cSAndrew Rist  * software distributed under the License is distributed on an
15*c142477cSAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*c142477cSAndrew Rist  * KIND, either express or implied.  See the License for the
17*c142477cSAndrew Rist  * specific language governing permissions and limitations
18*c142477cSAndrew Rist  * under the License.
19cdf0e10cSrcweir  *
20*c142477cSAndrew Rist  *************************************************************/
21*c142477cSAndrew Rist 
22*c142477cSAndrew Rist 
23cdf0e10cSrcweir 
24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
25cdf0e10cSrcweir #include "precompiled_sdext.hxx"
26cdf0e10cSrcweir 
27cdf0e10cSrcweir #if defined __SUNPRO_CC
28cdf0e10cSrcweir #pragma disable_warn
29cdf0e10cSrcweir #elif defined _MSC_VER
30cdf0e10cSrcweir #pragma warning(push, 1)
31cdf0e10cSrcweir #endif
32cdf0e10cSrcweir 
33cdf0e10cSrcweir #include "pdfparse.hxx"
34cdf0e10cSrcweir 
35cdf0e10cSrcweir // workaround windows compiler: do not include multi_pass.hpp
36cdf0e10cSrcweir //#include <boost/spirit.hpp>
37cdf0e10cSrcweir #include <boost/spirit/include/classic_core.hpp>
38cdf0e10cSrcweir #include <boost/spirit/include/classic_utility.hpp>
39cdf0e10cSrcweir #include <boost/spirit/include/classic_error_handling.hpp>
40cdf0e10cSrcweir #include <boost/spirit/include/classic_file_iterator.hpp>
41cdf0e10cSrcweir #include <boost/bind.hpp>
42cdf0e10cSrcweir #include <string>
43cdf0e10cSrcweir 
44cdf0e10cSrcweir #include <rtl/strbuf.hxx>
45cdf0e10cSrcweir #include <rtl/memory.h>
46cdf0e10cSrcweir #include <rtl/alloc.h>
47cdf0e10cSrcweir 
48cdf0e10cSrcweir // disable warnings again because someone along the line has enabled them
49cdf0e10cSrcweir #if defined __SUNPRO_CC
50cdf0e10cSrcweir #pragma disable_warn
51cdf0e10cSrcweir #elif defined _MSC_VER
52cdf0e10cSrcweir #pragma warning(push, 1)
53cdf0e10cSrcweir #endif
54cdf0e10cSrcweir 
55cdf0e10cSrcweir using namespace boost::spirit;
56cdf0e10cSrcweir using namespace rtl;
57cdf0e10cSrcweir using namespace pdfparse;
58cdf0e10cSrcweir 
59cdf0e10cSrcweir class StringEmitContext : public EmitContext
60cdf0e10cSrcweir {
61cdf0e10cSrcweir     OStringBuffer m_aBuf;
62cdf0e10cSrcweir     public:
StringEmitContext()63cdf0e10cSrcweir     StringEmitContext() : EmitContext(), m_aBuf(256) {}
~StringEmitContext()64cdf0e10cSrcweir     virtual ~StringEmitContext() {}
write(const void * pBuf,unsigned int nLen)65cdf0e10cSrcweir     virtual bool write( const void* pBuf, unsigned int nLen ) throw()
66cdf0e10cSrcweir     {
67cdf0e10cSrcweir         m_aBuf.append( (const sal_Char*)pBuf, nLen );
68cdf0e10cSrcweir         return true;
69cdf0e10cSrcweir     }
getCurPos()70cdf0e10cSrcweir     virtual unsigned int getCurPos() throw() { return m_aBuf.getLength(); }
copyOrigBytes(unsigned int nOrigOffset,unsigned int nLen)71cdf0e10cSrcweir     virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw()
72cdf0e10cSrcweir     { return (nOrigOffset+nLen < static_cast<unsigned int>(m_aBuf.getLength()) ) ?
73cdf0e10cSrcweir              write( m_aBuf.getStr() + nOrigOffset, nLen ) : false; }
readOrigBytes(unsigned int nOrigOffset,unsigned int nLen,void * pBuf)74cdf0e10cSrcweir     virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw()
75cdf0e10cSrcweir     {
76cdf0e10cSrcweir         if( nOrigOffset+nLen < static_cast<unsigned int>(m_aBuf.getLength()) )
77cdf0e10cSrcweir         {
78cdf0e10cSrcweir             rtl_copyMemory( pBuf, m_aBuf.getStr()+nOrigOffset, nLen );
79cdf0e10cSrcweir             return nLen;
80cdf0e10cSrcweir         }
81cdf0e10cSrcweir         return 0;
82cdf0e10cSrcweir     }
83cdf0e10cSrcweir 
getString()84cdf0e10cSrcweir     OString getString() { return m_aBuf.makeStringAndClear(); }
85cdf0e10cSrcweir };
86cdf0e10cSrcweir 
87cdf0e10cSrcweir template< class iteratorT >
88cdf0e10cSrcweir class PDFGrammar :  public grammar< PDFGrammar<iteratorT> >
89cdf0e10cSrcweir {
90cdf0e10cSrcweir public:
91cdf0e10cSrcweir 
PDFGrammar(const iteratorT & first)92cdf0e10cSrcweir     PDFGrammar( const iteratorT& first )
93cdf0e10cSrcweir     : m_fDouble( 0.0 ), m_aGlobalBegin( first ) {}
~PDFGrammar()94cdf0e10cSrcweir     ~PDFGrammar()
95cdf0e10cSrcweir     {
96cdf0e10cSrcweir         if( !m_aObjectStack.empty() )
97cdf0e10cSrcweir             delete m_aObjectStack.front();
98cdf0e10cSrcweir     }
99cdf0e10cSrcweir 
100cdf0e10cSrcweir     double m_fDouble;
101cdf0e10cSrcweir     std::vector< unsigned int > m_aUIntStack;
102cdf0e10cSrcweir     std::vector< PDFEntry* >    m_aObjectStack;
103cdf0e10cSrcweir     rtl::OString                m_aErrorString;
104cdf0e10cSrcweir     iteratorT                   m_aGlobalBegin;
105cdf0e10cSrcweir 
106cdf0e10cSrcweir public:
107cdf0e10cSrcweir     struct pdf_string_parser
108cdf0e10cSrcweir     {
109cdf0e10cSrcweir         typedef nil_t result_t;
110cdf0e10cSrcweir         template <typename ScannerT>
111cdf0e10cSrcweir         std::ptrdiff_t
operator ()PDFGrammar::pdf_string_parser112cdf0e10cSrcweir         operator()(ScannerT const& scan, result_t& result) const
113cdf0e10cSrcweir         {
114cdf0e10cSrcweir             std::ptrdiff_t len = 0;
115cdf0e10cSrcweir 
116cdf0e10cSrcweir             int nBraceLevel = 0;
117cdf0e10cSrcweir             while( ! scan.at_end() )
118cdf0e10cSrcweir             {
119cdf0e10cSrcweir                 char c = *scan;
120cdf0e10cSrcweir                 if( c == ')' )
121cdf0e10cSrcweir                 {
122cdf0e10cSrcweir                     nBraceLevel--;
123cdf0e10cSrcweir                     if( nBraceLevel < 0 )
124cdf0e10cSrcweir                         break;
125cdf0e10cSrcweir                 }
126cdf0e10cSrcweir                 else if( c == '(' )
127cdf0e10cSrcweir                     nBraceLevel++;
128cdf0e10cSrcweir                 else if( c == '\\' ) // ignore escaped braces
129cdf0e10cSrcweir                 {
130cdf0e10cSrcweir                     ++len;
131cdf0e10cSrcweir                     ++scan;
132cdf0e10cSrcweir                     if( scan.at_end() )
133cdf0e10cSrcweir                         break;
134cdf0e10cSrcweir                 }
135cdf0e10cSrcweir                 ++len;
136cdf0e10cSrcweir                 ++scan;
137cdf0e10cSrcweir             }
138cdf0e10cSrcweir             return scan.at_end() ? -1 : len;
139cdf0e10cSrcweir         }
140cdf0e10cSrcweir     };
141cdf0e10cSrcweir 
    // Spirit grammar definition: the constructor assembles all rules and
    // binds their semantic actions to member functions of the enclosing
    // PDFGrammar via boost::bind; start() names pdfrule as the start rule.
142cdf0e10cSrcweir     template< typename ScannerT >
143cdf0e10cSrcweir     struct definition
144cdf0e10cSrcweir     {
definitionPDFGrammar::definition145cdf0e10cSrcweir         definition( const PDFGrammar<iteratorT>& rSelf )
146cdf0e10cSrcweir         {
            // The grammar is handed in as const, but the bound actions are
            // non-const members that modify its stacks, hence the const_cast.
147cdf0e10cSrcweir             PDFGrammar<iteratorT>* pSelf = const_cast< PDFGrammar<iteratorT>* >( &rSelf );
148cdf0e10cSrcweir 
            // comment: '%' followed by everything up to and including the end of line
149cdf0e10cSrcweir             // workaround workshop compiler: comment_p doesn't work
150cdf0e10cSrcweir             // comment     = comment_p("%")[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )];
151cdf0e10cSrcweir             comment     = lexeme_d[ (ch_p('%') >> *(~ch_p('\r') & ~ch_p('\n')) >> eol_p)[boost::bind(&PDFGrammar::pushComment, pSelf, _1, _2 )] ];
152cdf0e10cSrcweir 
153cdf0e10cSrcweir             boolean     = (str_p("true") | str_p("false"))[boost::bind(&PDFGrammar::pushBool, pSelf, _1, _2)];
154cdf0e10cSrcweir 
            // stream: everything from "stream" up to the first "endstream"
155cdf0e10cSrcweir             // workaround workshop compiler: confix_p doesn't work
156cdf0e10cSrcweir             //stream      = confix_p( "stream", *anychar_p, "endstream" )[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
157cdf0e10cSrcweir             stream      = (str_p("stream") >> *(anychar_p - str_p("endstream")) >> str_p("endstream"))[boost::bind(&PDFGrammar::emitStream, pSelf, _1, _2 )];
158cdf0e10cSrcweir 
            // name: '/' followed by any run of characters that are neither
            // whitespace, one of ()<>[]{}/% nor NUL; only the part after the
            // slash is passed to pushName
159cdf0e10cSrcweir             name        = lexeme_d[
160cdf0e10cSrcweir                             ch_p('/')
161cdf0e10cSrcweir                             >> (*(anychar_p-chset_p("\t\n\f\r ()<>[]{}/%")-ch_p('\0')))
162cdf0e10cSrcweir                                [boost::bind(&PDFGrammar::pushName, pSelf, _1, _2)] ];
163cdf0e10cSrcweir 
164cdf0e10cSrcweir             // workaround workshop compiler: confix_p doesn't work
165cdf0e10cSrcweir             //stringtype  = ( confix_p("(",*anychar_p, ")") |
166cdf0e10cSrcweir             //                confix_p("<",*xdigit_p,  ">") )
167cdf0e10cSrcweir             //              [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
168cdf0e10cSrcweir 
            // stringtype: either "(...)" with the body scanned by
            // pdf_string_parser, or "<...>" containing only hex digits; the
            // delimiters are part of the range passed to pushString
169cdf0e10cSrcweir             stringtype  = ( ( ch_p('(') >> functor_parser<pdf_string_parser>() >> ch_p(')') ) |
170cdf0e10cSrcweir                             ( ch_p('<') >> *xdigit_p >> ch_p('>') ) )
171cdf0e10cSrcweir                           [boost::bind(&PDFGrammar::pushString,pSelf, _1, _2)];
172cdf0e10cSrcweir 
173cdf0e10cSrcweir             null_object = str_p( "null" )[boost::bind(&PDFGrammar::pushNull, pSelf, _1, _2)];
174cdf0e10cSrcweir 
            // objectref: two unsigned integers followed by 'R'; both numbers
            // are pushed onto m_aUIntStack and popped again by pushObjectRef.
            // NOTE(review): the uint_p actions presumably fire even when the
            // rest of the sequence fails to match, which would leave stale
            // values on m_aUIntStack - confirm against Spirit Classic docs.
175cdf0e10cSrcweir             #ifdef USE_ASSIGN_ACTOR
176cdf0e10cSrcweir             objectref   = ( uint_p[push_back_a(pSelf->m_aUIntStack)]
177cdf0e10cSrcweir                             >> uint_p[push_back_a(pSelf->m_aUIntStack)]
178cdf0e10cSrcweir                             >> ch_p('R')
179cdf0e10cSrcweir                             >> eps_p
180cdf0e10cSrcweir                           )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
181cdf0e10cSrcweir             #else
182cdf0e10cSrcweir             objectref   = ( uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
183cdf0e10cSrcweir                             >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
184cdf0e10cSrcweir                             >> ch_p('R')
185cdf0e10cSrcweir                             >> eps_p
186cdf0e10cSrcweir                           )[boost::bind(&PDFGrammar::pushObjectRef, pSelf, _1, _2)];
187cdf0e10cSrcweir             #endif
188cdf0e10cSrcweir 
            // simple_type: objectref is listed before the plain number
            // alternative; the parsed number is stored in m_fDouble and then
            // turned into an entry by pushDouble
189cdf0e10cSrcweir             #ifdef USE_ASSIGN_ACTOR
190cdf0e10cSrcweir             simple_type = objectref | name |
191cdf0e10cSrcweir                           ( real_p[assign_a(pSelf->m_fDouble)] >> eps_p )
192cdf0e10cSrcweir                           [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
193cdf0e10cSrcweir                           | stringtype | boolean | null_object;
194cdf0e10cSrcweir             #else
195cdf0e10cSrcweir             simple_type = objectref | name |
196cdf0e10cSrcweir                           ( real_p[boost::bind(&PDFGrammar::assign_action_double, pSelf, _1)] >> eps_p )
197cdf0e10cSrcweir                           [boost::bind(&PDFGrammar::pushDouble, pSelf, _1, _2)]
198cdf0e10cSrcweir                           | stringtype | boolean | null_object;
199cdf0e10cSrcweir             #endif
200cdf0e10cSrcweir 
201cdf0e10cSrcweir             dict_begin  = str_p( "<<" )[boost::bind(&PDFGrammar::beginDict, pSelf, _1, _2)];
202cdf0e10cSrcweir             dict_end    = str_p( ">>" )[boost::bind(&PDFGrammar::endDict, pSelf, _1, _2)];
203cdf0e10cSrcweir 
204cdf0e10cSrcweir             array_begin = str_p("[")[boost::bind(&PDFGrammar::beginArray,pSelf, _1, _2)];
205cdf0e10cSrcweir             array_end   = str_p("]")[boost::bind(&PDFGrammar::endArray,pSelf, _1, _2)];
206cdf0e10cSrcweir 
            // object_begin: "<number> <generation> obj"; beginObject is
            // attached to the "obj" keyword only, so it receives the
            // position of that keyword
207cdf0e10cSrcweir             #ifdef USE_ASSIGN_ACTOR
208cdf0e10cSrcweir             object_begin= uint_p[push_back_a(pSelf->m_aUIntStack)]
209cdf0e10cSrcweir                           >> uint_p[push_back_a(pSelf->m_aUIntStack)]
210cdf0e10cSrcweir                           >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
211cdf0e10cSrcweir             #else
212cdf0e10cSrcweir             object_begin= uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
213cdf0e10cSrcweir                           >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
214cdf0e10cSrcweir                           >> str_p("obj" )[boost::bind(&PDFGrammar::beginObject, pSelf, _1, _2)];
215cdf0e10cSrcweir             #endif
216cdf0e10cSrcweir             object_end  = str_p( "endobj" )[boost::bind(&PDFGrammar::endObject, pSelf, _1, _2)];
217cdf0e10cSrcweir 
            // xref: "xref", two unsigned integers, then one or more entries
            // of the form: 10 digits, blank, 5 digits, blank, 'n' or 'f',
            // two whitespace characters. No action is attached, so the
            // table contents are matched but not stored.
218cdf0e10cSrcweir             xref        = str_p( "xref" ) >> uint_p >> uint_p
219cdf0e10cSrcweir                           >> lexeme_d[
220cdf0e10cSrcweir                                 +( repeat_p(10)[digit_p]
221cdf0e10cSrcweir                                    >> blank_p
222cdf0e10cSrcweir                                    >> repeat_p(5)[digit_p]
223cdf0e10cSrcweir                                    >> blank_p
224cdf0e10cSrcweir                                    >> ( ch_p('n') | ch_p('f') )
225cdf0e10cSrcweir                                    >> repeat_p(2)[space_p]
226cdf0e10cSrcweir                                  ) ];
227cdf0e10cSrcweir 
            // dict_element: a flat list of tokens; nesting of dictionaries
            // and arrays is tracked by the begin/end actions on
            // m_aObjectStack rather than by recursive rules
228cdf0e10cSrcweir             dict_element= dict_begin | comment | simple_type
229cdf0e10cSrcweir                           | array_begin | array_end | dict_end;
230cdf0e10cSrcweir 
231cdf0e10cSrcweir             object      = object_begin
232cdf0e10cSrcweir                           >> *dict_element
233cdf0e10cSrcweir                           >> !stream
234cdf0e10cSrcweir                           >> object_end;
235cdf0e10cSrcweir 
236cdf0e10cSrcweir             trailer     = str_p( "trailer" )[boost::bind(&PDFGrammar::beginTrailer,pSelf,_1,_2)]
237cdf0e10cSrcweir                           >> *dict_element
238cdf0e10cSrcweir                           >> str_p("startxref")
239cdf0e10cSrcweir                           >> uint_p
240cdf0e10cSrcweir                           >> str_p("%%EOF")[boost::bind(&PDFGrammar::endTrailer,pSelf,_1,_2)];
241cdf0e10cSrcweir 
            // pdfrule (start rule): an optional "%PDF-<major>.<minor>" header
            // line, followed by any number of comments, objects and
            // xref/trailer pairs
242cdf0e10cSrcweir             #ifdef USE_ASSIGN_ACTOR
243cdf0e10cSrcweir             pdfrule     = ! (lexeme_d[
244cdf0e10cSrcweir                                 str_p( "%PDF-" )
245cdf0e10cSrcweir                                 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
246cdf0e10cSrcweir                                 >> ch_p('.')
247cdf0e10cSrcweir                                 >> uint_p[push_back_a(pSelf->m_aUIntStack)]
248cdf0e10cSrcweir                                 >> *((~ch_p('\r') & ~ch_p('\n')))
249cdf0e10cSrcweir                                 >> eol_p
250cdf0e10cSrcweir                              ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
251cdf0e10cSrcweir                           >> *( comment | object | ( xref >> trailer ) );
252cdf0e10cSrcweir             #else
253cdf0e10cSrcweir             pdfrule     = ! (lexeme_d[
254cdf0e10cSrcweir                                 str_p( "%PDF-" )
255cdf0e10cSrcweir                                 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
256cdf0e10cSrcweir                                 >> ch_p('.')
257cdf0e10cSrcweir                                 >> uint_p[boost::bind(&PDFGrammar::push_back_action_uint, pSelf, _1)]
258cdf0e10cSrcweir                                 >> *((~ch_p('\r') & ~ch_p('\n')))
259cdf0e10cSrcweir                                 >> eol_p
260cdf0e10cSrcweir                              ])[boost::bind(&PDFGrammar::haveFile,pSelf, _1, _2)]
261cdf0e10cSrcweir                           >> *( comment | object | ( xref >> trailer ) );
262cdf0e10cSrcweir             #endif
263cdf0e10cSrcweir         }
        // Note: "array" and "value" are declared here but never assigned in
        // the constructor above.
264cdf0e10cSrcweir         rule< ScannerT > comment, stream, boolean, name, stringtype, null_object, simple_type,
265cdf0e10cSrcweir                          objectref, array, value, dict_element, dict_begin, dict_end,
266cdf0e10cSrcweir                          array_begin, array_end, object, object_begin, object_end,
267cdf0e10cSrcweir                          xref, trailer, pdfrule;
268cdf0e10cSrcweir 
startPDFGrammar::definition269cdf0e10cSrcweir         const rule< ScannerT >& start() const { return pdfrule; }
270cdf0e10cSrcweir     };
271cdf0e10cSrcweir 
272cdf0e10cSrcweir     #ifndef USE_ASSIGN_ACTOR
push_back_action_uint(unsigned int i)273cdf0e10cSrcweir     void push_back_action_uint( unsigned int i )
274cdf0e10cSrcweir     {
275cdf0e10cSrcweir         m_aUIntStack.push_back( i );
276cdf0e10cSrcweir     }
assign_action_double(double d)277cdf0e10cSrcweir     void assign_action_double( double d )
278cdf0e10cSrcweir     {
279cdf0e10cSrcweir         m_fDouble = d;
280cdf0e10cSrcweir     }
281cdf0e10cSrcweir     #endif
282cdf0e10cSrcweir 
parseError(const char * pMessage,iteratorT pLocation)283cdf0e10cSrcweir     void parseError( const char* pMessage, iteratorT pLocation )
284cdf0e10cSrcweir     {
285cdf0e10cSrcweir         throw_( pLocation, pMessage );
286cdf0e10cSrcweir     }
287cdf0e10cSrcweir 
iteratorToString(iteratorT first,iteratorT last) const288cdf0e10cSrcweir     rtl::OString iteratorToString( iteratorT first, iteratorT last ) const
289cdf0e10cSrcweir     {
290cdf0e10cSrcweir         rtl::OStringBuffer aStr( 32 );
291cdf0e10cSrcweir         while( first != last )
292cdf0e10cSrcweir         {
293cdf0e10cSrcweir             aStr.append( *first );
294cdf0e10cSrcweir             ++first;
295cdf0e10cSrcweir         }
296cdf0e10cSrcweir         return aStr.makeStringAndClear();
297cdf0e10cSrcweir     }
298cdf0e10cSrcweir 
haveFile(iteratorT pBegin,iteratorT)299cdf0e10cSrcweir     void haveFile( iteratorT pBegin, iteratorT /*pEnd*/ )
300cdf0e10cSrcweir     {
301cdf0e10cSrcweir         if( m_aObjectStack.empty() )
302cdf0e10cSrcweir         {
303cdf0e10cSrcweir             PDFFile* pFile = new PDFFile();
304cdf0e10cSrcweir             pFile->m_nMinor = m_aUIntStack.back();
305cdf0e10cSrcweir             m_aUIntStack.pop_back();
306cdf0e10cSrcweir             pFile->m_nMajor = m_aUIntStack.back();
307cdf0e10cSrcweir             m_aUIntStack.pop_back();
308cdf0e10cSrcweir             m_aObjectStack.push_back( pFile );
309cdf0e10cSrcweir         }
310cdf0e10cSrcweir         else
311cdf0e10cSrcweir             parseError( "found file header in unusual place", pBegin );
312cdf0e10cSrcweir     }
313cdf0e10cSrcweir 
pushComment(iteratorT first,iteratorT last)314cdf0e10cSrcweir     void pushComment( iteratorT first, iteratorT last )
315cdf0e10cSrcweir     {
316cdf0e10cSrcweir         // add a comment to the current stack element
317cdf0e10cSrcweir         PDFComment* pComment =
318cdf0e10cSrcweir             new PDFComment(iteratorToString(first,last));
319cdf0e10cSrcweir         if( m_aObjectStack.empty() )
320cdf0e10cSrcweir             m_aObjectStack.push_back( new PDFPart() );
321cdf0e10cSrcweir         PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
322cdf0e10cSrcweir         if( pContainer == NULL )
323cdf0e10cSrcweir             parseError( "comment without container", first );
324cdf0e10cSrcweir         pContainer->m_aSubElements.push_back( pComment );
325cdf0e10cSrcweir     }
326cdf0e10cSrcweir 
insertNewValue(PDFEntry * pNewValue,iteratorT pPos)327cdf0e10cSrcweir     void insertNewValue( PDFEntry* pNewValue, iteratorT pPos )
328cdf0e10cSrcweir     {
329cdf0e10cSrcweir         PDFContainer* pContainer = NULL;
330cdf0e10cSrcweir         const char* pMsg = NULL;
331cdf0e10cSrcweir         if( ! m_aObjectStack.empty() &&
332cdf0e10cSrcweir             (pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back())) != NULL )
333cdf0e10cSrcweir         {
334cdf0e10cSrcweir             if( dynamic_cast<PDFDict*>(pContainer) == NULL      &&
335cdf0e10cSrcweir                 dynamic_cast<PDFArray*>(pContainer) == NULL )
336cdf0e10cSrcweir             {
337cdf0e10cSrcweir                 PDFObject* pObj = dynamic_cast<PDFObject*>(pContainer);
338cdf0e10cSrcweir                 if( pObj )
339cdf0e10cSrcweir                 {
340cdf0e10cSrcweir                     if( pObj->m_pObject == NULL )
341cdf0e10cSrcweir                         pObj->m_pObject = pNewValue;
342cdf0e10cSrcweir                     else
343cdf0e10cSrcweir                     {
344cdf0e10cSrcweir                         pMsg = "second value for object";
345cdf0e10cSrcweir                         pContainer = NULL;
346cdf0e10cSrcweir                     }
347cdf0e10cSrcweir                 }
348cdf0e10cSrcweir                 else if( dynamic_cast<PDFDict*>(pNewValue) )
349cdf0e10cSrcweir                 {
350cdf0e10cSrcweir                     PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pContainer);
351cdf0e10cSrcweir                     if( pTrailer )
352cdf0e10cSrcweir                     {
353cdf0e10cSrcweir                         if( pTrailer->m_pDict == NULL )
354cdf0e10cSrcweir                             pTrailer->m_pDict = dynamic_cast<PDFDict*>(pNewValue);
355cdf0e10cSrcweir                         else
356cdf0e10cSrcweir                             pContainer = NULL;
357cdf0e10cSrcweir                     }
358cdf0e10cSrcweir                     else
359cdf0e10cSrcweir                         pContainer = NULL;
360cdf0e10cSrcweir                 }
361cdf0e10cSrcweir                 else
362cdf0e10cSrcweir                     pContainer = NULL;
363cdf0e10cSrcweir             }
364cdf0e10cSrcweir         }
365cdf0e10cSrcweir         if( pContainer )
366cdf0e10cSrcweir             pContainer->m_aSubElements.push_back( pNewValue );
367cdf0e10cSrcweir         else
368cdf0e10cSrcweir         {
369cdf0e10cSrcweir             if( ! pMsg )
370cdf0e10cSrcweir             {
371cdf0e10cSrcweir                 if( dynamic_cast<PDFContainer*>(pNewValue) )
372cdf0e10cSrcweir                     pMsg = "array without container";
373cdf0e10cSrcweir                 else
374cdf0e10cSrcweir                     pMsg = "value without container";
375cdf0e10cSrcweir             }
376cdf0e10cSrcweir             delete pNewValue;
377cdf0e10cSrcweir             parseError( pMsg, pPos );
378cdf0e10cSrcweir         }
379cdf0e10cSrcweir     }
380cdf0e10cSrcweir 
pushName(iteratorT first,iteratorT last)381cdf0e10cSrcweir     void pushName( iteratorT first, iteratorT last )
382cdf0e10cSrcweir     {
383cdf0e10cSrcweir         insertNewValue( new PDFName(iteratorToString(first,last)), first );
384cdf0e10cSrcweir     }
385cdf0e10cSrcweir 
pushDouble(iteratorT first,iteratorT)386cdf0e10cSrcweir     void pushDouble( iteratorT first, iteratorT /*last*/ )
387cdf0e10cSrcweir     {
388cdf0e10cSrcweir         insertNewValue( new PDFNumber(m_fDouble), first );
389cdf0e10cSrcweir     }
390cdf0e10cSrcweir 
pushString(iteratorT first,iteratorT last)391cdf0e10cSrcweir     void pushString( iteratorT first, iteratorT last )
392cdf0e10cSrcweir     {
393cdf0e10cSrcweir         insertNewValue( new PDFString(iteratorToString(first,last)), first );
394cdf0e10cSrcweir     }
395cdf0e10cSrcweir 
pushBool(iteratorT first,iteratorT last)396cdf0e10cSrcweir     void pushBool( iteratorT first, iteratorT last )
397cdf0e10cSrcweir     {
398cdf0e10cSrcweir         insertNewValue( new PDFBool( (last-first == 4) ), first );
399cdf0e10cSrcweir     }
400cdf0e10cSrcweir 
pushNull(iteratorT first,iteratorT)401cdf0e10cSrcweir     void pushNull( iteratorT first, iteratorT )
402cdf0e10cSrcweir     {
403cdf0e10cSrcweir         insertNewValue( new PDFNull(), first );
404cdf0e10cSrcweir     }
405cdf0e10cSrcweir 
406cdf0e10cSrcweir 
beginObject(iteratorT first,iteratorT)407cdf0e10cSrcweir     void beginObject( iteratorT first, iteratorT /*last*/ )
408cdf0e10cSrcweir     {
409cdf0e10cSrcweir         if( m_aObjectStack.empty() )
410cdf0e10cSrcweir             m_aObjectStack.push_back( new PDFPart() );
411cdf0e10cSrcweir 
412cdf0e10cSrcweir         unsigned int nGeneration = m_aUIntStack.back();
413cdf0e10cSrcweir         m_aUIntStack.pop_back();
414cdf0e10cSrcweir         unsigned int nObject = m_aUIntStack.back();
415cdf0e10cSrcweir         m_aUIntStack.pop_back();
416cdf0e10cSrcweir 
417cdf0e10cSrcweir         PDFObject* pObj = new PDFObject( nObject, nGeneration );
418cdf0e10cSrcweir         pObj->m_nOffset = first - m_aGlobalBegin;
419cdf0e10cSrcweir 
420cdf0e10cSrcweir         PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
421cdf0e10cSrcweir         if( pContainer &&
422cdf0e10cSrcweir             ( dynamic_cast<PDFFile*>(pContainer) ||
423cdf0e10cSrcweir               dynamic_cast<PDFPart*>(pContainer) ) )
424cdf0e10cSrcweir         {
425cdf0e10cSrcweir             pContainer->m_aSubElements.push_back( pObj );
426cdf0e10cSrcweir             m_aObjectStack.push_back( pObj );
427cdf0e10cSrcweir         }
428cdf0e10cSrcweir         else
429cdf0e10cSrcweir             parseError( "object in wrong place", first );
430cdf0e10cSrcweir     }
431cdf0e10cSrcweir 
endObject(iteratorT first,iteratorT)432cdf0e10cSrcweir     void endObject( iteratorT first, iteratorT )
433cdf0e10cSrcweir     {
434cdf0e10cSrcweir         if( m_aObjectStack.empty() )
435cdf0e10cSrcweir             parseError( "endobj without obj", first );
436cdf0e10cSrcweir         else if( dynamic_cast<PDFObject*>(m_aObjectStack.back()) == NULL )
437cdf0e10cSrcweir             parseError( "spurious endobj", first );
438cdf0e10cSrcweir         else
439cdf0e10cSrcweir             m_aObjectStack.pop_back();
440cdf0e10cSrcweir     }
441cdf0e10cSrcweir 
pushObjectRef(iteratorT first,iteratorT)442cdf0e10cSrcweir     void pushObjectRef( iteratorT first, iteratorT )
443cdf0e10cSrcweir     {
444cdf0e10cSrcweir         unsigned int nGeneration = m_aUIntStack.back();
445cdf0e10cSrcweir         m_aUIntStack.pop_back();
446cdf0e10cSrcweir         unsigned int nObject = m_aUIntStack.back();
447cdf0e10cSrcweir         m_aUIntStack.pop_back();
448cdf0e10cSrcweir         insertNewValue( new PDFObjectRef(nObject,nGeneration), first );
449cdf0e10cSrcweir     }
450cdf0e10cSrcweir 
beginDict(iteratorT first,iteratorT)451cdf0e10cSrcweir     void beginDict( iteratorT first, iteratorT )
452cdf0e10cSrcweir     {
453cdf0e10cSrcweir         PDFDict* pDict = new PDFDict();
454cdf0e10cSrcweir         pDict->m_nOffset = first - m_aGlobalBegin;
455cdf0e10cSrcweir 
456cdf0e10cSrcweir         insertNewValue( pDict, first );
457cdf0e10cSrcweir         // will not come here if insertion fails (exception)
458cdf0e10cSrcweir         m_aObjectStack.push_back( pDict );
459cdf0e10cSrcweir     }
endDict(iteratorT first,iteratorT)460cdf0e10cSrcweir     void endDict( iteratorT first, iteratorT )
461cdf0e10cSrcweir     {
462cdf0e10cSrcweir         PDFDict* pDict = NULL;
463cdf0e10cSrcweir         if( m_aObjectStack.empty() )
464cdf0e10cSrcweir             parseError( "dictionary end without begin", first );
465cdf0e10cSrcweir         else if( (pDict = dynamic_cast<PDFDict*>(m_aObjectStack.back())) == NULL )
466cdf0e10cSrcweir             parseError( "spurious dictionary end", first );
467cdf0e10cSrcweir         else
468cdf0e10cSrcweir             m_aObjectStack.pop_back();
469cdf0e10cSrcweir 
470cdf0e10cSrcweir         PDFEntry* pOffender = pDict->buildMap();
471cdf0e10cSrcweir         if( pOffender )
472cdf0e10cSrcweir         {
473cdf0e10cSrcweir             StringEmitContext aCtx;
474cdf0e10cSrcweir             aCtx.write( "offending dictionary element: ", 30 );
475cdf0e10cSrcweir             pOffender->emit( aCtx );
476cdf0e10cSrcweir             m_aErrorString = aCtx.getString();
477cdf0e10cSrcweir             parseError( m_aErrorString.getStr(), first );
478cdf0e10cSrcweir         }
479cdf0e10cSrcweir     }
480cdf0e10cSrcweir 
beginArray(iteratorT first,iteratorT)481cdf0e10cSrcweir     void beginArray( iteratorT first, iteratorT )
482cdf0e10cSrcweir     {
483cdf0e10cSrcweir         PDFArray* pArray = new PDFArray();
484cdf0e10cSrcweir         pArray->m_nOffset = first - m_aGlobalBegin;
485cdf0e10cSrcweir 
486cdf0e10cSrcweir         insertNewValue( pArray, first );
487cdf0e10cSrcweir         // will not come here if insertion fails (exception)
488cdf0e10cSrcweir         m_aObjectStack.push_back( pArray );
489cdf0e10cSrcweir     }
490cdf0e10cSrcweir 
endArray(iteratorT first,iteratorT)491cdf0e10cSrcweir     void endArray( iteratorT first, iteratorT )
492cdf0e10cSrcweir     {
493cdf0e10cSrcweir         if( m_aObjectStack.empty() )
494cdf0e10cSrcweir             parseError( "array end without begin", first );
495cdf0e10cSrcweir         else if( dynamic_cast<PDFArray*>(m_aObjectStack.back()) == NULL )
496cdf0e10cSrcweir             parseError( "spurious array end", first );
497cdf0e10cSrcweir         else
498cdf0e10cSrcweir             m_aObjectStack.pop_back();
499cdf0e10cSrcweir     }
500cdf0e10cSrcweir 
emitStream(iteratorT first,iteratorT last)501cdf0e10cSrcweir     void emitStream( iteratorT first, iteratorT last )
502cdf0e10cSrcweir     {
503cdf0e10cSrcweir         if( m_aObjectStack.empty() )
504cdf0e10cSrcweir             parseError( "stream without object", first );
505cdf0e10cSrcweir         PDFObject* pObj = dynamic_cast<PDFObject*>(m_aObjectStack.back());
506cdf0e10cSrcweir         if( pObj && pObj->m_pObject )
507cdf0e10cSrcweir         {
508cdf0e10cSrcweir             if( pObj->m_pStream )
509cdf0e10cSrcweir                 parseError( "multiple streams in object", first );
510cdf0e10cSrcweir 
511cdf0e10cSrcweir             PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
512cdf0e10cSrcweir             if( pDict )
513cdf0e10cSrcweir             {
514cdf0e10cSrcweir                 PDFStream* pStream = new PDFStream( first - m_aGlobalBegin, last - m_aGlobalBegin, pDict );
515cdf0e10cSrcweir 
516cdf0e10cSrcweir                 pObj->m_pStream = pStream;
517cdf0e10cSrcweir                 pObj->m_aSubElements.push_back( pStream );
518cdf0e10cSrcweir             }
519cdf0e10cSrcweir         }
520cdf0e10cSrcweir         else
521cdf0e10cSrcweir             parseError( "stream without object", first );
522cdf0e10cSrcweir     }
523cdf0e10cSrcweir 
beginTrailer(iteratorT first,iteratorT)524cdf0e10cSrcweir     void beginTrailer( iteratorT first, iteratorT )
525cdf0e10cSrcweir     {
526cdf0e10cSrcweir         if( m_aObjectStack.empty() )
527cdf0e10cSrcweir             m_aObjectStack.push_back( new PDFPart() );
528cdf0e10cSrcweir 
529cdf0e10cSrcweir         PDFTrailer* pTrailer = new PDFTrailer();
530cdf0e10cSrcweir         pTrailer->m_nOffset = first - m_aGlobalBegin;
531cdf0e10cSrcweir 
532cdf0e10cSrcweir         PDFContainer* pContainer = dynamic_cast<PDFContainer*>(m_aObjectStack.back());
533cdf0e10cSrcweir         if( pContainer &&
534cdf0e10cSrcweir             ( dynamic_cast<PDFFile*>(pContainer) ||
535cdf0e10cSrcweir               dynamic_cast<PDFPart*>(pContainer) ) )
536cdf0e10cSrcweir         {
537cdf0e10cSrcweir             pContainer->m_aSubElements.push_back( pTrailer );
538cdf0e10cSrcweir             m_aObjectStack.push_back( pTrailer );
539cdf0e10cSrcweir         }
540cdf0e10cSrcweir         else
541cdf0e10cSrcweir             parseError( "trailer in wrong place", first );
542cdf0e10cSrcweir     }
543cdf0e10cSrcweir 
endTrailer(iteratorT first,iteratorT)544cdf0e10cSrcweir     void endTrailer( iteratorT first, iteratorT )
545cdf0e10cSrcweir     {
546cdf0e10cSrcweir         if( m_aObjectStack.empty() )
547cdf0e10cSrcweir             parseError( "%%EOF without trailer", first );
548cdf0e10cSrcweir         else if( dynamic_cast<PDFTrailer*>(m_aObjectStack.back()) == NULL )
549cdf0e10cSrcweir             parseError( "spurious %%EOF", first );
550cdf0e10cSrcweir         else
551cdf0e10cSrcweir             m_aObjectStack.pop_back();
552cdf0e10cSrcweir     }
553cdf0e10cSrcweir };
554cdf0e10cSrcweir 
read(const char * pBuffer,unsigned int nLen)555cdf0e10cSrcweir PDFEntry* PDFReader::read( const char* pBuffer, unsigned int nLen )
556cdf0e10cSrcweir {
557cdf0e10cSrcweir     PDFGrammar<const char*> aGrammar( pBuffer );
558cdf0e10cSrcweir 
559cdf0e10cSrcweir     try
560cdf0e10cSrcweir     {
561cdf0e10cSrcweir         boost::spirit::parse_info<const char*> aInfo =
562cdf0e10cSrcweir             boost::spirit::parse( pBuffer,
563cdf0e10cSrcweir                                   pBuffer+nLen,
564cdf0e10cSrcweir                                   aGrammar,
565cdf0e10cSrcweir                                   boost::spirit::space_p );
566cdf0e10cSrcweir         #if OSL_DEBUG_LEVEL > 1
567cdf0e10cSrcweir         fprintf( stderr, "parseinfo: stop = %p (buff=%p, offset = %d), hit = %s, full = %s, length = %d\n",
568cdf0e10cSrcweir                  aInfo.stop, pBuffer, aInfo.stop - pBuffer,
569cdf0e10cSrcweir                  aInfo.hit ? "true" : "false",
570cdf0e10cSrcweir                  aInfo.full ? "true" : "false",
571cdf0e10cSrcweir                  (int)aInfo.length );
572cdf0e10cSrcweir         #endif
573cdf0e10cSrcweir     }
574cdf0e10cSrcweir     catch( parser_error<const char*, const char*>& rError )
575cdf0e10cSrcweir     {
576cdf0e10cSrcweir         #if OSL_DEBUG_LEVEL > 1
577cdf0e10cSrcweir         fprintf( stderr, "parse error: %s at buffer pos %u\nobject stack:\n",
578cdf0e10cSrcweir                  rError.descriptor, rError.where - pBuffer );
579cdf0e10cSrcweir         unsigned int nElem = aGrammar.m_aObjectStack.size();
580cdf0e10cSrcweir         for( unsigned int i = 0; i < nElem; i++ )
581cdf0e10cSrcweir         {
582cdf0e10cSrcweir             fprintf( stderr, "   %s\n", typeid( *(aGrammar.m_aObjectStack[i]) ).name() );
583cdf0e10cSrcweir         }
584cdf0e10cSrcweir         #endif
585cdf0e10cSrcweir     }
586cdf0e10cSrcweir 
587cdf0e10cSrcweir     PDFEntry* pRet = NULL;
588cdf0e10cSrcweir     unsigned int nEntries = aGrammar.m_aObjectStack.size();
589cdf0e10cSrcweir     if( nEntries == 1 )
590cdf0e10cSrcweir     {
591cdf0e10cSrcweir         pRet = aGrammar.m_aObjectStack.back();
592cdf0e10cSrcweir         aGrammar.m_aObjectStack.pop_back();
593cdf0e10cSrcweir     }
594cdf0e10cSrcweir     #if OSL_DEBUG_LEVEL > 1
595cdf0e10cSrcweir     else if( nEntries > 1 )
596cdf0e10cSrcweir         fprintf( stderr, "error got %u stack objects in parse\n", nEntries );
597cdf0e10cSrcweir     #endif
598cdf0e10cSrcweir 
599cdf0e10cSrcweir     return pRet;
600cdf0e10cSrcweir }
601cdf0e10cSrcweir 
read(const char * pFileName)602cdf0e10cSrcweir PDFEntry* PDFReader::read( const char* pFileName )
603cdf0e10cSrcweir {
604cdf0e10cSrcweir     #ifdef WIN32
605cdf0e10cSrcweir     /* #i106583#
606cdf0e10cSrcweir        since converting to boost 1.39 file_iterator does not work anymore on all Windows systems
607cdf0e10cSrcweir        C++ stdlib istream_iterator does not allow "-" apparently
608cdf0e10cSrcweir        using spirit 2.0 doesn't work in our environment with the MSC
609cdf0e10cSrcweir 
610cdf0e10cSrcweir        So for the time being bite the bullet and read the whole file.
611cdf0e10cSrcweir        FIXME: give Spirit 2.x another try when we upgrade boost again.
612cdf0e10cSrcweir     */
613cdf0e10cSrcweir     PDFEntry* pRet = NULL;
614cdf0e10cSrcweir     FILE* fp = fopen( pFileName, "rb" );
615cdf0e10cSrcweir     if( fp )
616cdf0e10cSrcweir     {
617cdf0e10cSrcweir         fseek( fp, 0, SEEK_END );
618cdf0e10cSrcweir         unsigned int nLen = (unsigned int)ftell( fp );
619cdf0e10cSrcweir         fseek( fp, 0, SEEK_SET );
620cdf0e10cSrcweir         char* pBuf = (char*)rtl_allocateMemory( nLen );
621cdf0e10cSrcweir         if( pBuf )
622cdf0e10cSrcweir         {
623cdf0e10cSrcweir             fread( pBuf, 1, nLen, fp );
624cdf0e10cSrcweir             pRet = read( pBuf, nLen );
625cdf0e10cSrcweir             rtl_freeMemory( pBuf );
626cdf0e10cSrcweir         }
627cdf0e10cSrcweir         fclose( fp );
628cdf0e10cSrcweir     }
629cdf0e10cSrcweir     return pRet;
630cdf0e10cSrcweir     #else
631cdf0e10cSrcweir     file_iterator<> file_start( pFileName );
632cdf0e10cSrcweir     if( ! file_start )
633cdf0e10cSrcweir         return NULL;
634cdf0e10cSrcweir     file_iterator<> file_end = file_start.make_end();
635cdf0e10cSrcweir     PDFGrammar< file_iterator<> > aGrammar( file_start );
636cdf0e10cSrcweir 
637cdf0e10cSrcweir     try
638cdf0e10cSrcweir     {
639cdf0e10cSrcweir         boost::spirit::parse_info< file_iterator<> > aInfo =
640cdf0e10cSrcweir             boost::spirit::parse( file_start,
641cdf0e10cSrcweir                                   file_end,
642cdf0e10cSrcweir                                   aGrammar,
643cdf0e10cSrcweir                                   boost::spirit::space_p );
644cdf0e10cSrcweir         #if OSL_DEBUG_LEVEL > 1
645cdf0e10cSrcweir         fprintf( stderr, "parseinfo: stop at offset = %d, hit = %s, full = %s, length = %d\n",
646cdf0e10cSrcweir                  aInfo.stop - file_start,
647cdf0e10cSrcweir                  aInfo.hit ? "true" : "false",
648cdf0e10cSrcweir                  aInfo.full ? "true" : "false",
649cdf0e10cSrcweir                  (int)aInfo.length );
650cdf0e10cSrcweir         #endif
651cdf0e10cSrcweir     }
652cdf0e10cSrcweir     catch( parser_error< const char*, file_iterator<> >& rError )
653cdf0e10cSrcweir     {
654cdf0e10cSrcweir         #if OSL_DEBUG_LEVEL > 1
655cdf0e10cSrcweir         fprintf( stderr, "parse error: %s at buffer pos %u\nobject stack:\n",
656cdf0e10cSrcweir                  rError.descriptor, rError.where - file_start );
657cdf0e10cSrcweir         unsigned int nElem = aGrammar.m_aObjectStack.size();
658cdf0e10cSrcweir         for( unsigned int i = 0; i < nElem; i++ )
659cdf0e10cSrcweir         {
660cdf0e10cSrcweir             fprintf( stderr, "   %s\n", typeid( *(aGrammar.m_aObjectStack[i]) ).name() );
661cdf0e10cSrcweir         }
662cdf0e10cSrcweir         #endif
663cdf0e10cSrcweir     }
664cdf0e10cSrcweir 
665cdf0e10cSrcweir     PDFEntry* pRet = NULL;
666cdf0e10cSrcweir     unsigned int nEntries = aGrammar.m_aObjectStack.size();
667cdf0e10cSrcweir     if( nEntries == 1 )
668cdf0e10cSrcweir     {
669cdf0e10cSrcweir         pRet = aGrammar.m_aObjectStack.back();
670cdf0e10cSrcweir         aGrammar.m_aObjectStack.pop_back();
671cdf0e10cSrcweir     }
672cdf0e10cSrcweir     #if OSL_DEBUG_LEVEL > 1
673cdf0e10cSrcweir     else if( nEntries > 1 )
674cdf0e10cSrcweir     {
675cdf0e10cSrcweir         fprintf( stderr, "error got %u stack objects in parse\n", nEntries );
676cdf0e10cSrcweir         for( unsigned int i = 0; i < nEntries; i++ )
677cdf0e10cSrcweir         {
678cdf0e10cSrcweir             fprintf( stderr, "%s\n", typeid(*aGrammar.m_aObjectStack[i]).name() );
679cdf0e10cSrcweir             PDFObject* pObj = dynamic_cast<PDFObject*>(aGrammar.m_aObjectStack[i]);
680cdf0e10cSrcweir             if( pObj )
681cdf0e10cSrcweir                 fprintf( stderr, "   -> object %d generation %d\n", pObj->m_nNumber, pObj->m_nGeneration );
682cdf0e10cSrcweir             else
683cdf0e10cSrcweir                 fprintf( stderr, "(type %s)\n", typeid(*aGrammar.m_aObjectStack[i]).name() );
684cdf0e10cSrcweir         }
685cdf0e10cSrcweir     }
686cdf0e10cSrcweir     #endif
687cdf0e10cSrcweir     return pRet;
688cdf0e10cSrcweir     #endif // WIN32
689cdf0e10cSrcweir }
690cdf0e10cSrcweir 
// Re-enable compiler warnings: counterpart to the "#pragma disable_warn" /
// "#pragma warning(push, 1)" directives near the top of this file.
691cdf0e10cSrcweir #if defined __SUNPRO_CC
692cdf0e10cSrcweir #pragma enable_warn
693cdf0e10cSrcweir #elif defined _MSC_VER
694cdf0e10cSrcweir #pragma warning(pop)
695cdf0e10cSrcweir #endif
696cdf0e10cSrcweir 
697cdf0e10cSrcweir 
698