/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/

#include "HtmlFmtFlt.hxx"

#include <rtl/string.h>

#include <string>
#include <sstream>
#include <vector>
#include <iomanip>

#include <boost/assert.hpp>

using namespace com::sun::star::uno;

//------------------------------------------------------------------------------
// converts the openoffice text/html clipboard format to the HTML Format
// well known under MS Windows
// the MS HTML Format has a header before the real html data
//
// Version:1.0      Version number of the clipboard.
//                  Starting is 0.9
// StartHTML:       Byte count from the beginning of the clipboard to the start
//                  of the context, or -1 if no context
// EndHTML:         Byte count from the beginning of the clipboard to the end
//                  of the context, or -1 if no context
// StartFragment:   Byte count from the beginning of the clipboard to the
//                  start of the fragment
// EndFragment:     Byte count from the beginning of the clipboard to the
//                  end of the fragment
// StartSelection:  Byte count from the beginning of the clipboard to the
//                  start of the selection
// EndSelection:    Byte count from the beginning of the clipboard to the
//                  end of the selection
//
// StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
// text)
//------------------------------------------------------------------------------

namespace // private
{
    // Builds the textual header of the MS HTML clipboard format.
    //
    // Every line is terminated by "\r\n" and every offset is written as a
    // decimal number zero padded to ten digits, so the header length stays
    // constant as long as the offsets fit into ten digits.
    //
    // startHtml / endHtml          byte offsets of the context
    // startFragment / endFragment  byte offsets of the fragment
    // returns the complete header text
    std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
    {
        std::ostringstream htmlHeader;

        // fill character and number base are sticky, only the field width
        // has to be set again in front of every number
        htmlHeader << std::setfill('0') << std::dec;

        htmlHeader << "Version:1.0" << "\r\n";
        htmlHeader << "StartHTML:" << std::setw(10) << startHtml << "\r\n";
        htmlHeader << "EndHTML:" << std::setw(10) << endHtml << "\r\n";
        htmlHeader << "StartFragment:" << std::setw(10) << startFragment << "\r\n";
        htmlHeader << "EndFragment:" << std::setw(10) << endFragment << "\r\n";

        return htmlHeader.str();
    }

} // namespace private


// the office always writes the start and end html tag in upper cases and
// without spaces both tags don't allow parameters
const std::string TAG_HTML = std::string("<HTML>");
const std::string TAG_END_HTML = std::string("</HTML>");

// The body tag may have parameters so
we need to search for the 82 // closing '>' manually e.g. <BODY param> #92840# 83 const std::string TAG_BODY = std::string("<BODY"); 84 const std::string TAG_END_BODY = std::string("</BODY"); 85 86 Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml) 87 { 88 OSL_ASSERT(aTextHtml.getLength() > 0); 89 90 if (!(aTextHtml.getLength() > 0)) 91 return Sequence<sal_Int8>(); 92 93 // fill the buffer with dummy values to calc the exact length 94 std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0); 95 size_t lHtmlFormatHeader = dummyHtmlHeader.length(); 96 97 std::string textHtml( 98 reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()), 99 reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()) + aTextHtml.getLength()); 100 101 std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so 102 std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>? 103 104 // The body tag may have parameters so we need to search for the 105 // closing '>' manually e.g. 
<BODY param> #92840# 106 std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1; 107 std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader; 108 109 std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment); 110 htmlFormat += textHtml; 111 112 Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0' 113 rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength()); 114 115 rtl_copyMemory( 116 static_cast<void*>(byteSequence.getArray()), 117 static_cast<const void*>(htmlFormat.c_str()), 118 htmlFormat.length()); 119 120 return byteSequence; 121 } 122 123 const char* HtmlStartTag = "<html"; 124 125 Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat) 126 { 127 BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided"); 128 129 Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat); 130 sal_Char* dataStart = reinterpret_cast<sal_Char*>(nonconstHTMLFormatRef.getArray()); 131 sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1; 132 const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag); 133 134 BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all"); 135 136 // It doesn't seem to be HTML? Well then simply return what has been 137 // provided in non-debug builds 138 if (htmlStartTag == NULL) 139 { 140 return aHTMLFormat; 141 } 142 143 sal_Int32 len = dataEnd - htmlStartTag; 144 Sequence<sal_Int8> plainHtmlData(len); 145 146 rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len); 147 148 return plainHtmlData; 149 } 150 151 /* A simple format detection. We are just comparing the first few bytes 152 of the provided byte sequence to see whether or not it is the MS 153 Office Html format. 
If it shows that this is not reliable enough we 154 can improve this 155 */ 156 const char HtmlFormatStart[] = "Version:"; 157 int HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1); 158 159 bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence) 160 { 161 if (aHtmlSequence.getLength() < HtmlFormatStartLen) 162 return false; 163 164 return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart, 165 HtmlFormatStartLen, 166 reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()), 167 HtmlFormatStartLen) == 0; 168 } 169