Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

# -*- coding: UTF-8 -*- 

# Copyright 2011-2016 Luc Saffre 

# License: BSD (see file COPYING for details) 

 

"""This module contains mainly a utility function :func:`html2odf` 

which converts an ElementTree object generated using 

:mod:`lino.utils.xmlgen.html` to a fragment of ODF. 

 

.. This is part of the Lino test suite. To test it individually, run: 

 

    $ python lino/utils/html2odf.py 

 

This is not trivial. The challenge is that HTML and ODF are quite 

different document representations. But something like this seems 

necessary. Lino uses it in order to generate .odt documents which

contain (among other things) chunks of HTML that have been entered using

TinyMCE and stored in database fields.

 

TODO: is there really no existing library for this task? I saw 

approaches which call libreoffice in headless mode to do the 

conversion, but this sounds inappropriate for our situation where we 

must glue together fragments from different sources. Also note that we 

use :mod:`appy.pod` to do the actual generation. 

 

Usage examples: 

 

>>> from lino.utils.xmlgen.html import E 

>>> def test(e): 

...     print E.tostring(e) 

...     print toxml(html2odf(e)) 

>>> test(E.p("This is a ", E.b("first"), " test.")) 

... #doctest: +NORMALIZE_WHITESPACE 

<p>This is a <b>first</b> test.</p> 

<text:p xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0">This 

is a <text:span text:style-name="Strong Emphasis">first</text:span> 

test.</text:p> 

 

>>> test(E.p(E.b("This")," is another test.")) 

... #doctest: +NORMALIZE_WHITESPACE 

<p><b>This</b> is another test.</p> 

<text:p xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"><text:span 

text:style-name="Strong Emphasis">This</text:span> is another test.</text:p> 

 

>>> test(E.p(E.i("This")," is another test.")) 

... #doctest: +NORMALIZE_WHITESPACE 

<p><i>This</i> is another test.</p> 

<text:p xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"><text:span 

text:style-name="Emphasis">This</text:span> is another test.</text:p> 

 

>>> test(E.td(E.p("This is another test."))) 

... #doctest: +NORMALIZE_WHITESPACE 

<td><p>This is another test.</p></td> 

<text:p xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0">This 

is another test.</text:p> 

 

>>> test(E.td(E.p(E.b("This"), " is another test."))) 

... #doctest: +NORMALIZE_WHITESPACE 

<td><p><b>This</b> is another test.</p></td> 

<text:p xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"><text:span 

text:style-name="Strong Emphasis">This</text:span> is another test.</text:p> 

 

>>> test(E.ul(E.li("First item"),E.li("Second item"))) #doctest: +NORMALIZE_WHITESPACE 

<ul><li>First item</li><li>Second item</li></ul> 

<text:list xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"  

text:style-name="podBulletedList"><text:list-item><text:p  

text:style-name="podBulletItem">First item</text:p></text:list-item><text:list-item><text:p  

text:style-name="podBulletItem">Second item</text:p></text:list-item></text:list> 

 

N.B.: the above chunk is obviously not correct since Writer doesn't display it. 

(How can I debug a generated odt file?  

I mean if my content.xml is syntactically valid but Writer ...) 

Idea: validate it against the ODF specification using lxml 

 

 

 

:func:`html2odf` converts bold text to a span with a  

style named "Strong Emphasis". That's currently a hard-coded name, and the  

caller must make sure that a style of that name is defined in the  

document. 

 

The text formats `<i>` and `<em>` are converted to a style "Emphasis". 

 

 

Edge case: 

 

>>> print toxml(html2odf("Plain string")) 

<text:p xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0">Plain string</text:p> 

 

>>> print toxml(html2odf(u"Ein schöner Text")) 

<text:p xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0">Ein schöner Text</text:p> 

 

Not yet supported 

================= 

 

The following is an example for :ticket:`788`. Conversion fails if a

sequence of paragraph-level items is grouped using a div:

 

>>> test(E.div(E.p("Two numbered items:"), 

...    E.ol(E.li("first"), E.li("second")))) 

... #doctest: +NORMALIZE_WHITESPACE 

Traceback (most recent call last): 

... 

IllegalText: The <text:section> element does not allow text 

 

 

>>> test(E.raw('<ul type="disc"><li>First</li><li>Second</li></ul>')) 

<ul type="disc"><li>First</li><li>Second</li></ul> 

<text:list xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0" text:style-name="podBulletedList"><text:list-item><text:p text:style-name="podBulletItem">First</text:p></text:list-item><text:list-item><text:p text:style-name="podBulletItem">Second</text:p></text:list-item></text:list> 

 

""" 

 

from __future__ import unicode_literals 

from future import standard_library 

standard_library.install_aliases() 

from builtins import str 

from past.builtins import basestring 

 

import logging 

logger = logging.getLogger(__name__) 

 

from io import StringIO 

 

 

def toxml(node):
    """Return the XML representation of an ODF node as a string.

    The node is asked to write itself (``node.toXml(0, stream)``) into an
    in-memory text stream; whatever it wrote is returned.
    """
    with StringIO() as stream:
        node.toXml(0, stream)
        xml = stream.getvalue()
    return xml

 

 

from odf import text 

 

 

# Tags for which html2odf() creates no ODF element of its own: their text
# and children are written directly into the current container
# (``oe = ct``).  'div' is listed here, but html2odf() tests for it in an
# earlier branch (where it creates a text.Section), so the PTAGS branch is
# never reached for a div.
#~ PTAGS = ('p','td','li')
PTAGS = ('p', 'td', 'div', 'table', 'tr')

 

 

def html2odf(e, ct=None, **ctargs):
    """Convert a :mod:`lino.utils.xmlgen.html` element to an ODF text element.

    Most formats are not implemented.  Supported tags are ``b``, ``a``,
    ``i``, ``em``, ``span``, ``br``, ``h1`` to ``h3``, ``div``, ``ul``,
    ``li`` and those listed in :data:`PTAGS`.  ``img`` elements are
    ignored (but the text following them is kept).

    There's probably a better way to do this...

    :e: the HTML element to convert, or a plain string.

    :ct: the root element ("container"). If not specified, we create
         a ``text.P`` from `ctargs`.

    :ctargs: keyword arguments used for the paragraphs created here
             (the implicit container and the paragraph inside each
             ``li``).  They are handed down to the recursive calls; a
             ``ul`` replaces them by ``stylename='podBulletItem'``.

    Returns the container.  For ``ul``, ``h1`` to ``h3`` and ``div`` this
    is a newly created list, heading or section which replaces any `ct`
    that was passed in.  Returns `None` if `e` is an ``img`` element.

    Raises `NotImplementedError` for any other tag.
    """
    # NOTE(review): this local is never incremented, so every section
    # created for a <div> below gets the same name "S1".
    sections_counter = 1

    if ct is None:
        ct = text.P(**ctargs)

    if isinstance(e, basestring):
        ct.addText(e)
        return ct

    tag = e.tag

    if tag == 'img':
        # Images are ignored, but the text which follows the image (its
        # tail) is part of the surrounding content and must not be
        # dropped together with the image.  It is added the same way as
        # the tail of every other element.
        if e.tail:
            ct.addText(e.tail)
        return  # still None, as before, so callers can detect the skip

    if tag == 'ul':
        # A list starts a new container of its own; the paragraphs of its
        # items get the bullet item style.
        ct = text.List(stylename='podBulletedList')
        ctargs = dict(stylename='podBulletItem')

    # Where text and children of `e` go.  Defaults to `oe` (see below).
    text_container = None

    if tag in ('b', 'a'):
        # Links are rendered like bold text.
        oe = text.Span(stylename='Strong Emphasis')
    elif tag in ('i', 'em'):
        oe = text.Span(stylename='Emphasis')
    elif tag == 'span':
        oe = text.Span()
    elif tag == 'br':
        oe = text.LineBreak()
    elif tag in ('h1', 'h2', 'h3'):
        # e.g. <text:h text:style-name="Heading_20_1" text:outline-level="1">
        level = int(tag[1])
        oe = ct = text.H(stylename="Heading %d" % level, outlinelevel=level)
    elif tag == 'div':
        oe = ct = text.Section(name="S" + str(sections_counter))
    elif tag == 'ul':
        oe = ct  # the text.List created above
    elif tag == 'li':
        # The item itself cannot hold the text, so wrap it in a paragraph.
        oe = text.ListItem()
        text_container = text.P(**ctargs)
        oe.appendChild(text_container)
    elif tag in PTAGS:
        # No element of its own: write directly into the container.
        oe = ct
    else:
        raise NotImplementedError("<%s> inside <%s>" % (tag, ct.tagName))

    if text_container is None:
        text_container = oe
    if e.text:
        text_container.addText(e.text)
    for child in e:
        html2odf(child, text_container, **ctargs)

    # Attach the new element unless it *is* the container.
    if oe is not ct:
        ct.appendChild(oe)

    # Text following the element belongs to the container, not to `oe`.
    if e.tail:
        ct.addText(e.tail)
    return ct

 

 

def _test():
    """Run the doctests of the ``__main__`` module via ``doctest.testmod``."""
    from doctest import testmod

    testmod()

 

# When executed as a script (not imported), run this module's doctests.
if __name__ == "__main__":

    _test()