xref: /AOO41X/main/sw/source/filter/ascii/parasc.cxx (revision 69a743679e823ad8f875be547552acb607b8ada5)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_sw.hxx"
26 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
27 
28 
29 #include <tools/stream.hxx>
30 #include <hintids.hxx>
31 #include <rtl/tencinfo.h>
32 #include <sfx2/printer.hxx>
33 #include <editeng/fontitem.hxx>
34 #include <editeng/langitem.hxx>
35 #include <editeng/brkitem.hxx>
36 #include <editeng/scripttypeitem.hxx>
37 #include <shellio.hxx>
38 #include <doc.hxx>
39 #include <swtypes.hxx>
40 #include <ndtxt.hxx>
41 #include <pam.hxx>
42 #include <frmatr.hxx>
43 #include <fltini.hxx>
44 #include <pagedesc.hxx>
45 #include <breakit.hxx>
46 #include <swerror.h>
47 #ifndef _STATSTR_HRC
48 #include <statstr.hrc>          // ResId fuer Statusleiste
49 #endif
50 #include <mdiexp.hxx>           // ...Percent()
51 #include <poolfmt.hxx>
52 
53 #include "vcl/metric.hxx"
54 
// Number of bytes read from the input stream per block. The parser allocates
// ASC_BUFFLEN + 2 bytes so that a sal_Unicode terminator can still be written
// directly behind a completely filled block.
#define ASC_BUFFLEN 4096
56 
57 class SwASCIIParser
58 {
59     SwDoc* pDoc;
60     SwPaM* pPam;
61     SvStream& rInput;
62     sal_Char* pArr;
63     const SwAsciiOptions& rOpt;
64     SfxItemSet* pItemSet;
65     long nFileSize;
66     sal_uInt16 nScript;
67     bool bNewDoc;
68 
69     sal_uLong ReadChars();
70     void InsertText( const String& rStr );
71 
72 public:
73     SwASCIIParser( SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
74                             int bReadNewDoc, const SwAsciiOptions& rOpts );
75     ~SwASCIIParser();
76 
77     sal_uLong CallParser();
78 };
79 
80 
81 // Aufruf fuer die allg. Reader-Schnittstelle
Read(SwDoc & rDoc,const String &,SwPaM & rPam,const String &)82 sal_uLong AsciiReader::Read( SwDoc &rDoc, const String&, SwPaM &rPam, const String & )
83 {
84     if( !pStrm )
85     {
86         ASSERT( !this, "ASCII-Read ohne Stream" );
87         return ERR_SWG_READ_ERROR;
88     }
89 
90     //JP 18.01.96: Alle Ueberschriften sind normalerweise ohne
91     //              Kapitelnummer. Darum hier explizit abschalten
92     //              weil das Default jetzt wieder auf AN ist.
93     if( !bInsertMode )
94         Reader::SetNoOutlineNum( rDoc );
95 
96     SwASCIIParser* pParser = new SwASCIIParser( &rDoc, rPam, *pStrm,
97                                         !bInsertMode, aOpt.GetASCIIOpts() );
98     sal_uLong nRet = pParser->CallParser();
99 
100     delete pParser;
101     // after Read reset the options
102     aOpt.ResetASCIIOpts();
103     return nRet;
104 }
105 
SwASCIIParser(SwDoc * pD,const SwPaM & rCrsr,SvStream & rIn,int bReadNewDoc,const SwAsciiOptions & rOpts)106 SwASCIIParser::SwASCIIParser(SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
107     int bReadNewDoc, const SwAsciiOptions& rOpts)
108     : pDoc(pD), rInput(rIn), rOpt(rOpts), nScript(0), bNewDoc(bReadNewDoc)
109 {
110     pPam = new SwPaM( *rCrsr.GetPoint() );
111     pArr = new sal_Char [ ASC_BUFFLEN + 2 ];
112 
113     pItemSet = new SfxItemSet( pDoc->GetAttrPool(),
114                 RES_CHRATR_FONT,        RES_CHRATR_LANGUAGE,
115                 RES_CHRATR_CJK_FONT,    RES_CHRATR_CJK_LANGUAGE,
116                 RES_CHRATR_CTL_FONT,    RES_CHRATR_CTL_LANGUAGE,
117                 0 );
118 
119     // set defaults from the options
120     if( rOpt.GetLanguage() )
121     {
122         SvxLanguageItem aLang( (LanguageType)rOpt.GetLanguage(),
123                                 RES_CHRATR_LANGUAGE );
124         pItemSet->Put( aLang );
125         pItemSet->Put( aLang, RES_CHRATR_CJK_LANGUAGE );
126         pItemSet->Put( aLang, RES_CHRATR_CTL_LANGUAGE );
127     }
128     if( rOpt.GetFontName().Len() )
129     {
130         Font aTextFont( rOpt.GetFontName(), Size( 0, 10 ) );
131         if( pDoc->getPrinter( false ) )
132             aTextFont = pDoc->getPrinter( false )->GetFontMetric( aTextFont );
133         SvxFontItem aFont( aTextFont.GetFamily(), aTextFont.GetName(),
134                            aEmptyStr, aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
135         pItemSet->Put( aFont );
136         pItemSet->Put( aFont, RES_CHRATR_CJK_FONT );
137         pItemSet->Put( aFont, RES_CHRATR_CTL_FONT );
138     }
139 }
140 
~SwASCIIParser()141 SwASCIIParser::~SwASCIIParser()
142 {
143     delete pPam;
144     delete [] pArr;
145     delete pItemSet;
146 }
147 
148 
149 // Aufruf des Parsers
CallParser()150 sal_uLong SwASCIIParser::CallParser()
151 {
152     rInput.Seek(STREAM_SEEK_TO_END);
153     rInput.ResetError();
154 
155     nFileSize = rInput.Tell();
156     rInput.Seek(STREAM_SEEK_TO_BEGIN);
157     rInput.ResetError();
158 
159     ::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, pDoc->GetDocShell() );
160 
161     SwPaM* pInsPam = 0;
162     xub_StrLen nSttCntnt = 0;
163     if (!bNewDoc)
164     {
165         const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
166         pInsPam = new SwPaM( rTmp, rTmp, 0, -1 );
167         nSttCntnt = pPam->GetPoint()->nContent.GetIndex();
168     }
169 
170     SwTxtFmtColl *pColl = 0;
171 
172     if (bNewDoc)
173     {
174         pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_HTML_PRE, false);
175         if (!pColl)
176             pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_STANDARD,false);
177         if (pColl)
178             pDoc->SetTxtFmtColl(*pPam, pColl);
179     }
180 
181     sal_uLong nError = ReadChars();
182 
183     if( pItemSet )
184     {
185         // set only the attribute, for scanned scripts.
186         if( !( SCRIPTTYPE_LATIN & nScript ))
187         {
188             pItemSet->ClearItem( RES_CHRATR_FONT );
189             pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
190         }
191         if( !( SCRIPTTYPE_ASIAN & nScript ))
192         {
193             pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
194             pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
195         }
196         if( !( SCRIPTTYPE_COMPLEX & nScript ))
197         {
198             pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
199             pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
200         }
201         if( pItemSet->Count() )
202         {
203             if( bNewDoc )
204             {
205                 if (pColl)
206                 {
207                     // Using the pool defaults for the font causes significant
208                     // trouble for the HTML filter, because it is not able
209                     // to export the pool defaults (or to be more precice:
210                     // the HTML filter is not able to detect whether a pool
211                     // default has changed or not. Even a comparison with the
212                     // HTMLi template does not work, because the defaults are
213                     // not copied when a new doc is created. The result of
214                     // comparing pool defaults therfor would be that the
215                     // defaults are exported always if the have changed for
216                     // text documents in general. That's not sensible, as well
217                     // as it is not sensible to export them always.
218                     sal_uInt16 aWhichIds[4] =
219                     {
220                         RES_CHRATR_FONT, RES_CHRATR_CJK_FONT,
221                         RES_CHRATR_CTL_FONT, 0
222                     };
223                     sal_uInt16 *pWhichIds = aWhichIds;
224                     while (*pWhichIds)
225                     {
226                         const SfxPoolItem *pItem;
227                         if (SFX_ITEM_SET == pItemSet->GetItemState(*pWhichIds,
228                             false, &pItem))
229                         {
230                             pColl->SetFmtAttr( *pItem );
231                             pItemSet->ClearItem( *pWhichIds );
232                         }
233                         ++pWhichIds;
234                     }
235                 }
236                 if (pItemSet->Count())
237                     pDoc->SetDefault(*pItemSet);
238             }
239             else if( pInsPam )
240             {
241                 // then set over the insert range the defined attributes
242                 *pInsPam->GetMark() = *pPam->GetPoint();
243                 pInsPam->GetPoint()->nNode++;
244                 pInsPam->GetPoint()->nContent.Assign(
245                                     pInsPam->GetCntntNode(), nSttCntnt );
246 
247                 // !!!!!
248                 ASSERT( !this, "Have to change - hard attr. to para. style" );
249                 pDoc->InsertItemSet( *pInsPam, *pItemSet, 0 );
250             }
251         }
252         delete pItemSet, pItemSet = 0;
253     }
254 
255     if( pInsPam )
256         delete pInsPam;
257 
258     ::EndProgress( pDoc->GetDocShell() );
259     return nError;
260 }
261 
ReadChars()262 sal_uLong SwASCIIParser::ReadChars()
263 {
264     sal_Unicode *pStt = 0, *pEnd = 0, *pLastStt = 0;
265     long nReadCnt = 0, nLineLen = 0;
266     sal_Unicode cLastCR = 0;
267     bool bSwapUnicode = false;
268 
269     const SwAsciiOptions *pUseMe=&rOpt;
270     SwAsciiOptions aEmpty;
271     if (nFileSize >= 2 &&
272         aEmpty.GetFontName() == rOpt.GetFontName() &&
273         aEmpty.GetCharSet() == rOpt.GetCharSet() &&
274         aEmpty.GetLanguage() == rOpt.GetLanguage() &&
275         aEmpty.GetParaFlags() == rOpt.GetParaFlags())
276     {
277         sal_uLong nLen, nOrig;
278         nOrig = nLen = rInput.Read(pArr, ASC_BUFFLEN);
279         CharSet eCharSet;
280         bool bRet = SwIoSystem::IsDetectableText(pArr, nLen, &eCharSet, &bSwapUnicode);
281         ASSERT(bRet, "Autodetect of text import without nag dialog must "
282             "have failed");
283         if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
284         {
285             aEmpty.SetCharSet(eCharSet);
286             rInput.SeekRel(-(long(nLen)));
287         }
288         else
289             rInput.SeekRel(-(long(nOrig)));
290         pUseMe=&aEmpty;
291     }
292 
293     rtl_TextToUnicodeConverter hConverter=0;
294     rtl_TextToUnicodeContext hContext=0;
295     CharSet currentCharSet = pUseMe->GetCharSet();
296     if (RTL_TEXTENCODING_UCS2 != currentCharSet)
297     {
298         if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
299                 currentCharSet = RTL_TEXTENCODING_ASCII_US;
300         hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
301         ASSERT( hConverter, "no string convert avaiable" );
302         if (!hConverter)
303             return ERROR_SW_READ_BASE;
304         bSwapUnicode = false;
305         hContext = rtl_createTextToUnicodeContext( hConverter );
306     }
307     else if (pUseMe != &aEmpty)  //Already successfully figured out type
308     {
309         rInput.StartReadingUnicodeText( currentCharSet );
310         bSwapUnicode = rInput.IsEndianSwap();
311     }
312 
313     String sWork;
314     sal_uLong nArrOffset = 0;
315 
316     do {
317         if( pStt >= pEnd )
318         {
319             if( pLastStt != pStt )
320                 InsertText( String( pLastStt ));
321 
322             // lese einen neuen Block ein
323             sal_uLong lGCount;
324             if( SVSTREAM_OK != rInput.GetError() || 0 == (lGCount =
325                         rInput.Read( pArr + nArrOffset,
326                                      ASC_BUFFLEN - nArrOffset )))
327                 break;      // aus der WHILE-Schleife heraus
328 
329             /*
330             #98380#
331             If there was some unconverted bytes on the last cycle then they
332             were put at the beginning of the array, so total bytes available
333             to convert this cycle includes them. If we found 0 following bytes
334             then we ignore the previous partial character.
335             */
336             lGCount+=nArrOffset;
337 
338             if( hConverter )
339             {
340                 sal_uInt32 nInfo;
341                 sal_Size nNewLen = lGCount, nCntBytes;
342                 sal_Unicode* pBuf = sWork.AllocBuffer( static_cast< xub_StrLen >(nNewLen) );
343 
344                 nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
345                                 pArr, lGCount, pBuf, nNewLen,
346                                 (
347                                 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
348                                 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
349                                 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
350                                 RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
351                                 ),
352                                 &nInfo,
353                                 &nCntBytes );
354                 if( 0 != ( nArrOffset = lGCount - nCntBytes ) )
355                     memmove( pArr, pArr + nCntBytes, nArrOffset );
356                 sWork.ReleaseBufferAccess( static_cast< xub_StrLen >(nNewLen) );
357 
358                 pStt = pLastStt = sWork.GetBufferAccess();
359                 pEnd = pStt + nNewLen;
360             }
361             else
362             {
363                 pStt = pLastStt = (sal_Unicode*)pArr;
364                 pEnd = (sal_Unicode*)(pArr + lGCount);
365 
366                 if( bSwapUnicode )
367                 {
368                     sal_Char* pF = pArr, *pN = pArr + 1;
369                     for( sal_uLong n = 0; n < lGCount; n += 2, pF += 2, pN += 2 )
370                     {
371                         sal_Char c = *pF;
372                         *pF = *pN;
373                         *pN = c;
374                     }
375                 }
376             }
377 
378             *pEnd = 0;
379             nReadCnt += lGCount;
380 
381             ::SetProgressState( nReadCnt, pDoc->GetDocShell() );
382 
383             if( cLastCR )
384             {
385                 if( 0x0a == *pStt && 0x0d == cLastCR )
386                     pLastStt = ++pStt;
387                 cLastCR = 0;
388                 nLineLen = 0;
389                 // JP 03.04.96: das letze am Ende nehmen wir nicht
390                 if( !rInput.IsEof() || !(pEnd == pStt ||
391                     ( !*pEnd && pEnd == pStt+1 ) ) )
392                     pDoc->SplitNode( *pPam->GetPoint(), false );
393             }
394         }
395 
396         bool bIns = true, bSplitNode = false;
397         switch( *pStt )
398         {
399 //JP 12.11.2001: task 94636 - don't ignore all behind the zero character,
400 //                            change it to the default "control character"
401 //      case 0:
402 //                  pEnd = pStt;
403 //                  bIns = false ;
404 //                  break;
405 
406         case 0x0a:  if( LINEEND_LF == pUseMe->GetParaFlags() )
407                     {
408                         bIns = false;
409                         *pStt = 0;
410                         ++pStt;
411 
412                         // JP 03.04.96: das letze am Ende nehmen wir nicht
413                         if( !rInput.IsEof() || pEnd != pStt )
414                             bSplitNode = true;
415                     }
416                     break;
417 
418         case 0x0d:  if( LINEEND_LF != pUseMe->GetParaFlags() )
419                     {
420                         bIns = false;
421                         *pStt = 0;
422                         ++pStt;
423 
424                         bool bChkSplit = false;
425                         if( LINEEND_CRLF == pUseMe->GetParaFlags() )
426                         {
427                             if( pStt == pEnd )
428                                 cLastCR = 0x0d;
429                             else if( 0x0a == *pStt )
430                             {
431                                 ++pStt;
432                                 bChkSplit = true;
433                             }
434                         }
435                         else
436                             bChkSplit = true;
437 
438                             // JP 03.04.96: das letze am Ende nehmen wir nicht
439                         if( bChkSplit && ( !rInput.IsEof() || pEnd != pStt ))
440                             bSplitNode = true;
441                     }
442                     break;
443 
444         case 0x0c:
445                     {
446                         // dann mal einen harten Seitenumbruch einfuegen
447                         *pStt++ = 0;
448                         if( nLineLen )
449                         {
450                             // Change to charset system!!!!
451                             //rOpt.GetCharSet();
452                             InsertText( String( pLastStt ));
453                         }
454                         pDoc->SplitNode( *pPam->GetPoint(), false );
455                         pDoc->InsertPoolItem(
456                             *pPam, SvxFmtBreakItem( SVX_BREAK_PAGE_BEFORE, RES_BREAK ), 0);
457                         pLastStt = pStt;
458                         nLineLen = 0;
459                         bIns = false;
460                     }
461                     break;
462 
463         case 0x1a:
464                     if( nReadCnt == nFileSize && pStt+1 == pEnd )
465                         *pStt = 0;
466                     else
467                         *pStt = '#';        // Ersatzdarstellung
468                     break;
469 
470         case '\t':  break;
471 
472         default:
473             if( ' ' > *pStt )
474                     // Ctrl-Zchn gefunden ersetze durch '#'
475                 *pStt = '#';
476             break;
477         }
478 
479         if( bIns )
480         {
481             if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
482                 ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
483             {
484                 sal_Unicode c = *pStt;
485                 *pStt = 0;
486                 InsertText( String( pLastStt ));
487                 pDoc->SplitNode( *pPam->GetPoint(), false );
488                 pLastStt = pStt;
489                 nLineLen = 0;
490                 *pStt = c;
491             }
492             ++pStt;
493             ++nLineLen;
494         }
495         else if( bSplitNode )
496         {
497             // es wurde ein CR/LF erkannt, also speichere den Text
498 
499             InsertText( String( pLastStt ));
500             pDoc->SplitNode( *pPam->GetPoint(), false );
501             pLastStt = pStt;
502             nLineLen = 0;
503         }
504     } while(true);
505 
506     if( hConverter )
507     {
508         rtl_destroyTextToUnicodeContext( hConverter, hContext );
509         rtl_destroyTextToUnicodeConverter( hConverter );
510     }
511     return 0;
512 }
513 
InsertText(const String & rStr)514 void SwASCIIParser::InsertText( const String& rStr )
515 {
516     pDoc->InsertString( *pPam, rStr );
517     if( pItemSet && pBreakIt && nScript != ( SCRIPTTYPE_LATIN |
518                                              SCRIPTTYPE_ASIAN |
519                                              SCRIPTTYPE_COMPLEX ) )
520         nScript |= pBreakIt->GetAllScriptsOfText( rStr );
521 }
522 
523 /* vi:set tabstop=4 shiftwidth=4 expandtab: */
524