1 /************************************************************** 2 * 3 * Licensed to the Apache Software Foundation (ASF) under one 4 * or more contributor license agreements. See the NOTICE file 5 * distributed with this work for additional information 6 * regarding copyright ownership. The ASF licenses this file 7 * to you under the Apache License, Version 2.0 (the 8 * "License"); you may not use this file except in compliance 9 * with the License. You may obtain a copy of the License at 10 * 11 * http://www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, 14 * software distributed under the License is distributed on an 15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 * KIND, either express or implied. See the License for the 17 * specific language governing permissions and limitations 18 * under the License. 19 * 20 *************************************************************/ 21 22 23 24 // MARKER(update_precomp.py): autogen include statement, do not remove 25 #include "precompiled_sdext.hxx" 26 27 #include <stdio.h> 28 #include <sal/main.h> 29 #include <osl/file.h> 30 #include <osl/thread.h> 31 #include <rtl/alloc.h> 32 #include <rtl/ustring.hxx> 33 #include <rtl/strbuf.hxx> 34 35 #include "pdfparse.hxx" 36 37 using namespace rtl; 38 using namespace pdfparse; 39 40 void printHelp( const char* pExe ) 41 { 42 fprintf( stdout, 43 "USAGE: %s [-h,--help]\n" 44 " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n" 45 " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n" 46 " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n" 47 " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:g1][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n" 48 " -h, --help: show help\n" 49 " -a, --extract-add-streams: extracts additional streams to outputfile_object\n" 50 " and prints the mimetype found to stdout\n" 51 " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported\n" 52 " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n" 53 " object numbers, where object number and generation number are separated by \':\'\n" 54 " an omitted generation number defaults to 0\n" 55 " -pw, --password: use password for decryption\n" 56 "\n" 57 "note: -f, -a, -o and normal unzip operation are mutually exclusive\n" 58 , pExe, pExe, pExe, pExe, pExe ); 59 } 60 61 class FileEmitContext : public EmitContext 62 { 63 oslFileHandle m_aHandle; 64 oslFileHandle m_aReadHandle; 65 unsigned int m_nReadLen; 66 67 void openReadFile( const char* pOrigName ); 68 69 public: 70 FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop ); 71 virtual ~FileEmitContext(); 72 73 virtual bool write( const void* pBuf, unsigned int nLen ) throw(); 74 virtual unsigned int getCurPos() throw(); 75 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw(); 76 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw(); 77 }; 78 79 FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop ) 80 : EmitContext( pTop ), 81 m_aHandle( NULL ), 82 m_aReadHandle( NULL ), 83 m_nReadLen( 0 ) 84 { 85 OUString aSysFile( OStringToOUString( OString( pFileName ), osl_getThreadTextEncoding() ) ); 86 OUString aURL; 87 if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None ) 88 { 89 fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName ); 90 return; 91 } 92 93 if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None ) 94 { 95 if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None ) 96 { 97 fprintf( stderr, "could not truncate %s\n", pFileName ); 98 osl_closeFile( m_aHandle ); 99 m_aHandle = NULL; 100 } 101 } 102 else if( osl_openFile( aURL.pData, &m_aHandle, 103 osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None ) 104 { 105 fprintf( stderr, "could not open %s\n", pFileName ); 106 return; 107 } 108 m_bDeflate = true; 109 110 openReadFile( pOrigName ); 111 } 112 113 FileEmitContext::~FileEmitContext() 114 { 115 if( m_aHandle ) 116 osl_closeFile( m_aHandle ); 117 if( m_aReadHandle ) 118 osl_closeFile( m_aReadHandle ); 119 } 120 121 void FileEmitContext::openReadFile( const char* pInFile ) 122 { 123 OUString aSysFile( OStringToOUString( OString( pInFile ), osl_getThreadTextEncoding() ) ); 124 OUString aURL; 125 if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None ) 126 { 127 fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile ); 128 return; 129 } 130 131 if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None ) 132 { 133 fprintf( stderr, "could not open %s\n", pInFile ); 134 return; 135 } 136 137 if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None ) 138 { 139 fprintf( stderr, "could not seek to end of %s\n", pInFile ); 140 osl_closeFile( m_aReadHandle ); 141 return; 142 } 143 144 sal_uInt64 nFileSize = 0; 145 if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None ) 146 { 147 fprintf( stderr, "could not get end pos of %s\n", pInFile ); 148 osl_closeFile( m_aReadHandle ); 149 return; 150 } 151 152 m_nReadLen = static_cast<unsigned int>(nFileSize); 153 } 154 155 bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) throw() 156 { 157 if( ! m_aHandle ) 158 return false; 159 160 sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen); 161 sal_uInt64 nWritten = 0; 162 return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None) 163 && nWrite == nWritten; 164 } 165 166 unsigned int FileEmitContext::getCurPos() throw() 167 { 168 sal_uInt64 nFileSize = 0; 169 if( m_aHandle ) 170 { 171 if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None ) 172 nFileSize = 0; 173 } 174 return static_cast<unsigned int>(nFileSize); 175 } 176 177 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw() 178 { 179 if( nOrigOffset + nLen > m_nReadLen ) 180 return false; 181 182 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None ) 183 { 184 fprintf( stderr, "could not seek to offset %u\n", nOrigOffset ); 185 return false; 186 } 187 void* pBuf = rtl_allocateMemory( nLen ); 188 if( ! pBuf ) 189 return false; 190 sal_uInt64 nBytesRead = 0; 191 if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None 192 || nBytesRead != static_cast<sal_uInt64>(nLen) ) 193 { 194 fprintf( stderr, "could not read %u bytes\n", nLen ); 195 rtl_freeMemory( pBuf ); 196 return false; 197 } 198 bool bRet = write( pBuf, nLen ); 199 rtl_freeMemory( pBuf ); 200 return bRet; 201 } 202 203 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw() 204 { 205 if( nOrigOffset + nLen > m_nReadLen ) 206 return 0; 207 208 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None ) 209 { 210 fprintf( stderr, "could not seek to offset %u\n", nOrigOffset ); 211 return 0; 212 } 213 sal_uInt64 nBytesRead = 0; 214 if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None ) 215 return 0; 216 return static_cast<unsigned int>(nBytesRead); 217 } 218 219 typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*); 220 221 int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl ) 222 { 223 224 PDFReader aParser; 225 int nRet = 0; 226 PDFEntry* pEntry = aParser.read( pInFile ); 227 if( pEntry ) 228 { 229 PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry); 230 if( pPDFFile ) 231 { 232 fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" ); 233 if( pPassword ) 234 fprintf( stdout, "password %s\n", 235 pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" ); 236 nRet = pHdl( pInFile, pOutFile, pPDFFile ); 237 } 238 else 239 nRet = 20; 240 delete pEntry; 241 } 242 return nRet; 243 } 244 245 int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile ) 246 { 247 FileEmitContext aContext( pOutFile, pInFile, pPDFFile ); 248 aContext.m_bDecrypt = pPDFFile->isEncrypted(); 249 pPDFFile->emit(aContext); 250 return 0; 251 } 252 253 int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile ) 254 { 255 int nRet = 0; 256 unsigned int nArrayElements = pStreams->m_aSubElements.size(); 257 for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ ) 258 { 259 PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i]); 260 PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1]); 261 if( ! pMimeType ) 262 fprintf( stderr, "error: no mimetype element\n" ); 263 if( ! pStreamRef ) 264 fprintf( stderr, "error: no stream ref element\n" ); 265 if( pMimeType && pStreamRef ) 266 { 267 fprintf( stdout, "found stream %d %d with mimetype %s\n", 268 pStreamRef->m_nNumber, pStreamRef->m_nGeneration, 269 pMimeType->m_aName.getStr() ); 270 PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration ); 271 if( pObject ) 272 { 273 rtl::OStringBuffer aOutStream( pOutFile ); 274 aOutStream.append( "_stream_" ); 275 aOutStream.append( sal_Int32(pStreamRef->m_nNumber) ); 276 aOutStream.append( "_" ); 277 aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) ); 278 FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile ); 279 aContext.m_bDecrypt = pPDFFile->isEncrypted(); 280 pObject->writeStream( aContext, pPDFFile ); 281 } 282 else 283 { 284 fprintf( stderr, "object not found\n" ); 285 nRet = 121; 286 } 287 } 288 else 289 nRet = 120; 290 } 291 return nRet; 292 } 293 294 int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile ) 295 { 296 // find all trailers 297 int nRet = 0; 298 unsigned int nElements = pPDFFile->m_aSubElements.size(); 299 for( unsigned i = 0; i < nElements && nRet == 0; i++ ) 300 { 301 PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i]); 302 if( pTrailer && pTrailer->m_pDict ) 303 { 304 // search for AdditionalStreams entry 305 std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator add_stream; 306 add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" ); 307 if( add_stream != pTrailer->m_pDict->m_aMap.end() ) 308 { 309 PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second); 310 if( pStreams ) 311 nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile ); 312 } 313 } 314 } 315 return nRet; 316 } 317 318 int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile ) 319 { 320 int nRet = 0; 321 unsigned int nElements = i_pPDFFile->m_aSubElements.size(); 322 for( unsigned i = 0; i < nElements && nRet == 0; i++ ) 323 { 324 // search FontDescriptors 325 PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i]); 326 if( ! pObj ) 327 continue; 328 PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject); 329 if( ! pDict ) 330 continue; 331 332 std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator map_it = 333 pDict->m_aMap.find( "Type" ); 334 if( map_it == pDict->m_aMap.end() ) 335 continue; 336 337 PDFName* pName = dynamic_cast<PDFName*>(map_it->second); 338 if( ! pName ) 339 continue; 340 if( ! pName->m_aName.equals( "FontDescriptor" ) ) 341 continue; 342 343 // the font name will be helpful, also there must be one in 344 // a font descriptor 345 map_it = pDict->m_aMap.find( "FontName" ); 346 if( map_it == pDict->m_aMap.end() ) 347 continue; 348 pName = dynamic_cast<PDFName*>(map_it->second); 349 if( ! pName ) 350 continue; 351 rtl::OString aFontName( pName->m_aName ); 352 353 PDFObjectRef* pStreamRef = 0; 354 const char* pFileType = NULL; 355 // we have a font descriptor, try for a type 1 font 356 map_it = pDict->m_aMap.find( "FontFile" ); 357 if( map_it != pDict->m_aMap.end() ) 358 { 359 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second); 360 if( pStreamRef ) 361 pFileType = "pfa"; 362 } 363 364 // perhaps it's a truetype file ? 365 if( ! pStreamRef ) 366 { 367 map_it = pDict->m_aMap.find( "FontFile2" ); 368 if( map_it != pDict->m_aMap.end() ) 369 { 370 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second); 371 if( pStreamRef ) 372 pFileType = "ttf"; 373 } 374 } 375 376 if( ! pStreamRef ) 377 continue; 378 379 PDFObject* pStream = i_pPDFFile->findObject( pStreamRef ); 380 if( ! pStream ) 381 continue; 382 383 rtl::OStringBuffer aOutStream( i_pOutFile ); 384 aOutStream.append( "_font_" ); 385 aOutStream.append( sal_Int32(pStreamRef->m_nNumber) ); 386 aOutStream.append( "_" ); 387 aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) ); 388 aOutStream.append( "_" ); 389 aOutStream.append( aFontName ); 390 if( pFileType ) 391 { 392 aOutStream.append( "." ); 393 aOutStream.append( pFileType ); 394 } 395 FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile ); 396 aContext.m_bDecrypt = i_pPDFFile->isEncrypted(); 397 pStream->writeStream( aContext, i_pPDFFile ); 398 } 399 return nRet; 400 } 401 402 std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects; 403 404 int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile ) 405 { 406 int nRet = 0; 407 unsigned int nElements = s_aEmitObjects.size(); 408 for( unsigned i = 0; i < nElements && nRet == 0; i++ ) 409 { 410 sal_Int32 nObject = s_aEmitObjects[i].first; 411 sal_Int32 nGeneration = s_aEmitObjects[i].second; 412 PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration ); 413 if( ! pStream ) 414 { 415 fprintf( stderr, "object %d %d not found !\n", (int)nObject, (int)nGeneration ); 416 continue; 417 } 418 419 rtl::OStringBuffer aOutStream( i_pOutFile ); 420 aOutStream.append( "_stream_" ); 421 aOutStream.append( nObject ); 422 aOutStream.append( "_" ); 423 aOutStream.append( nGeneration ); 424 FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile ); 425 aContext.m_bDecrypt = i_pPDFFile->isEncrypted(); 426 pStream->writeStream( aContext, i_pPDFFile ); 427 } 428 return nRet; 429 } 430 431 SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv ) 432 { 433 const char* pInFile = NULL; 434 const char* pOutFile = NULL; 435 const char* pPassword = NULL; 436 OStringBuffer aOutFile( 256 ); 437 PDFFileHdl aHdl = write_unzipFile; 438 439 for( int nArg = 1; nArg < argc; nArg++ ) 440 { 441 if( argv[nArg][0] == '-' ) 442 { 443 if( ! rtl_str_compare( "-pw", argv[nArg] ) || 444 ! rtl_str_compare( "--password" , argv[nArg] ) ) 445 { 446 if( nArg == argc-1 ) 447 { 448 fprintf( stderr, "no password given\n" ); 449 return 1; 450 } 451 nArg++; 452 pPassword = argv[nArg]; 453 } 454 else if( ! rtl_str_compare( "-h", argv[nArg] ) || 455 ! rtl_str_compare( "--help", argv[nArg] ) ) 456 { 457 printHelp( argv[0] ); 458 return 0; 459 } 460 else if( ! rtl_str_compare( "-a", argv[nArg] ) || 461 ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) ) 462 { 463 aHdl = write_addStreams; 464 } 465 else if( ! rtl_str_compare( "-f", argv[nArg] ) || 466 ! rtl_str_compare( "--extract-fonts", argv[nArg] ) ) 467 { 468 aHdl = write_fonts; 469 } 470 else if( ! rtl_str_compare( "-o", argv[nArg] ) || 471 ! rtl_str_compare( "--extract-objects", argv[nArg] ) ) 472 { 473 aHdl = write_objects; 474 nArg++; 475 if( nArg < argc ) 476 { 477 rtl::OString aObjs( argv[nArg] ); 478 sal_Int32 nIndex = 0; 479 while( nIndex != -1 ) 480 { 481 rtl::OString aToken( aObjs.getToken( 0, ',', nIndex ) ); 482 sal_Int32 nObject = 0; 483 sal_Int32 nGeneration = 0; 484 sal_Int32 nGenIndex = 0; 485 nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32(); 486 if( nGenIndex != -1 ) 487 nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32(); 488 s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) ); 489 } 490 } 491 } 492 else 493 { 494 fprintf( stderr, "unrecognized option \"%s\"\n", 495 argv[nArg] ); 496 printHelp( argv[0] ); 497 return 1; 498 } 499 } 500 else if( pInFile == NULL ) 501 pInFile = argv[nArg]; 502 else if( pOutFile == NULL ) 503 pOutFile = argv[nArg]; 504 } 505 if( ! pInFile ) 506 { 507 fprintf( stderr, "no input file given\n" ); 508 return 10; 509 } 510 if( ! pOutFile ) 511 { 512 OString aFile( pInFile ); 513 if( aFile.getLength() > 0 ) 514 { 515 if( aFile.getLength() > 4 ) 516 { 517 if( aFile.matchIgnoreAsciiCase( OString( ".pdf" ), aFile.getLength()-4 ) ) 518 aOutFile.append( pInFile, aFile.getLength() - 4 ); 519 else 520 aOutFile.append( aFile ); 521 } 522 aOutFile.append( "_unzip.pdf" ); 523 pOutFile = aOutFile.getStr(); 524 } 525 else 526 { 527 fprintf( stderr, "no output file given\n" ); 528 return 11; 529 } 530 } 531 532 return handleFile( pInFile, pOutFile, pPassword, aHdl ); 533 } 534 535