1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 __author__ = """Ronaldo Amaral Santos <ronaldinho.as@gmail.com>"""
32 __docformat__ = 'plaintext'
33
34 from xmlrpclib import *
35 from xml.dom.minidom import parseString
36 import copy
37 from StringIO import StringIO
38
39 from GranularUtils import Grain
40
41 import zipfile, base64, re
42 import os, sys, time, random
43 import shutil
44
45 import PIL.Image
46 import commands
47
48 import mimetypes
49
51
52
54 """
55 - Provide the grain extraction functionality for ms-office and odf documents
56 - Retrieve tables, images, thumbnails and summary
57 """
# Class-level defaults; the enclosing `class` statement is not visible in this
# listing. __zipFile and __parseContent are assigned later by the unzip/parse
# step (zipfile.PyZipFile / parseString). Document and __ooodServer are
# presumably set by __init__, whose body is not visible here -- TODO confirm.
58 Document = None
59 __parseContent = None
60 __zipFile = None
61 __ooodServer = None
# MIME types spanning OpenDocument, Sun XML / StarOffice, MS Office, RTF and
# plain-text formats. NOTE(review): the code that consults this tuple is not
# visible in this listing.
62 supportedMimeType=('application/vnd.oasis.opendocument.text',
63 'application/vnd.sun.xml.writer',
64 'application/msword',
65 'application/rtf',
66 'application/vnd.stardivision.writer',
67 'application/x-starwriter',
68 'text/plain',
69 'application/vnd.oasis.opendocument.spreadsheet',
70 'application/vnd.sun.xml.calc',
71 'application/vnd.ms-excel',
72 'application/vnd.stardivision.calc',
73 'application/x-starcalc',
74 'application/vnd.oasis.opendocument.presentation',
75 'application/vnd.sun.xml.impress',
76 'application/vnd.ms-powerpoint',
77 'application/vnd.stardivision.draw',
78 'application/vnd.stardivision.impress',
79 'application/x-starimpress',)
80
# OpenDocument text and presentation types. By its name, the set of formats
# that can be granulated without conversion -- verify against the reader.
81 supportedGranulateMimeTypes=('application/vnd.oasis.opendocument.text',
82 'application/vnd.oasis.opendocument.presentation',)
83
# MS Word, RTF and MS PowerPoint types. By its name, the set of formats that
# are converted to ODF first -- verify against the reader.
84 supportedConvertionMimeTypes=('application/msword',
85 'application/rtf',
86 'application/vnd.ms-powerpoint',)
87
88
89
90
91
92 - def __init__(self, Document=None, ooodServer=None):
107
108
109
112
113
114
115
116
# NOTE(review): the `def` line of this method is not visible in this listing;
# the body appears to be the target of the `self.__mkServer()` call made by the
# conversion code -- confirm.
# Returns a ServerProxy (from the `xmlrpclib` star import) built from
# self.__ooodServer, or raises ConectionServerError.
# NOTE(review): ConectionServerError is not defined in the visible part of
# this file.
118 """
119 Create a connection to the OpenOffice(oood-ERP5) Server
120 """
121 try:
122 if self.__ooodServer is not None:
123 return ServerProxy(self.__ooodServer)
124 else:
# This raise happens inside the `try`, so the bare `except:` below catches it
# and the "oood Server not found" message never reaches the caller.
125 raise ConectionServerError, "It was not possible to connect. oood Server not found "
# NOTE(review): a bare `except:` also intercepts KeyboardInterrupt/SystemExit.
126 except:
127 raise ConectionServerError, "It was not possible to connect to the Convertion Server."
128
# NOTE(review): the `def` line of this method is not visible in this listing;
# the body appears to be the target of `self.__createNewOOoDocument()` -- confirm.
# Returns the raw contents of template/template.odt, located relative to the
# directory of this module.
130 """
131 Creates a new odt document based in a blank template
132 """
133 templatePath = os.path.join(os.path.dirname(__file__), 'template', 'template.odt')
# NOTE(review): the file is opened in text mode and never explicitly closed,
# although an .odt file is a zip archive (binary data).
134 template_str=open(templatePath).read()
135 return template_str
136
def __getNodeText(self, node):
    """
    Return the text held directly by an XML node.

    Only the immediate children of ``node`` are inspected: the ``data`` of
    every child text node is concatenated in document order. Text nested
    inside child elements is not included.

    node -- a DOM node exposing ``childNodes``
    Returns the concatenated text, or '' when there is no text child.
    """
    # Node types are plain integers: compare them by value. ``is`` only
    # worked through CPython's small-integer caching.
    return ''.join([child.data for child in node.childNodes
                    if child.nodeType == child.TEXT_NODE])
146
def __getTextChildNodesImage(self, node, text=None):
    """
    Get the subtitle text of image in odf document

    Walks every sibling that follows ``node`` and records one entry per
    sibling: the ``data`` of a text node, or the direct text of any other
    node as returned by ``__getNodeText``.

    node -- the DOM node whose following siblings are scanned
    text -- optional list to append to; it is extended in place and
            returned. A fresh list is created when omitted.
    Returns the list of collected strings (empty when ``node`` is the last
    sibling and no list was supplied).
    """
    # A mutable default ([]) would be shared between calls and make captions
    # of earlier images leak into later ones.
    if text is None:
        text = []
    # Iterate instead of recursing once per sibling, so long sibling runs
    # cannot exhaust the recursion limit.
    sibling = node.nextSibling
    while sibling is not None:
        if sibling.nodeType == sibling.TEXT_NODE:
            text.append(sibling.data)
        else:
            text.append(self.__getNodeText(sibling))
        sibling = sibling.nextSibling
    return text
160
def __getTextChildNodesTable(self, node, text=None):
    """
    Get the subtitle text of a table in odf document

    Collects the ``data`` of every text node found below ``node``, at any
    depth, in document order.

    node -- the DOM node whose descendants are scanned
    text -- optional list to append to; it is extended in place and
            returned. A fresh list is created when omitted.
    Returns the list of collected strings.
    """
    # A mutable default ([]) would be shared between calls and accumulate
    # text from unrelated tables.
    if text is None:
        text = []
    # Explicit stack instead of recursion. Children are pushed reversed so
    # they are popped, and therefore visited, in document order.
    pending = list(reversed(node.childNodes))
    while pending:
        current = pending.pop()
        if current.nodeType == current.TEXT_NODE:
            text.append(current.data)
        if current.hasChildNodes():
            pending.extend(reversed(current.childNodes))
    return text
171
# NOTE(review): the `def` line of this method is not visible in this listing;
# the body appears to be the target of `self.__getAttrStyles(Node)` -- confirm.
# Returns the value of the first attribute of Node whose name matches
# "<prefix>:style-name" and whose value is non-empty. Falls off the end
# (implicit None) when Node has no attributes or no such attribute.
173 """
174 Get the associated Styles of given node
175 """
176 if Node.attributes is not None:
177 for i in Node.attributes.keys():
178 if re.search("^.+\:style-name$",i):
179 if Node.getAttribute(i):
180 return Node.getAttribute(i)
181
# NOTE(review): the `def` line of this method is not visible in this listing;
# the body appears to be the target of `self.__getAttributesR(...)` -- confirm.
# Recursively appends the style name of Node and of each of its descendants to
# `styles`. Returns `styles` when it is non-empty, otherwise falls off the end
# (implicit None).
# NOTE(review): the table-extraction code calls this with a single argument,
# so `styles` must have a default. If that default is a mutable list, style
# names would accumulate across calls -- check the signature.
183 style=self.__getAttrStyles(Node)
184 if style:
185 styles.append(style)
186 for i in Node.childNodes:
# The return value of the recursive call is ignored; results travel through
# the shared `styles` list.
187 self.__getAttributesR(i,styles)
188 if styles:
189 return styles
190
# NOTE(review): the `def` line (and therefore the name) of this method is not
# visible in this listing.
# Sends the document's filename and base64-encoded data to the server's
# `convert` method. Returns a StringIO with the decoded result when the
# response code is 200, otherwise None.
192 """
193 Convert a ms-office document to Open Document Format (odf)
194 """
195 sp = self.__mkServer()
196 res = sp.convert(self.Document.getFilename(), base64.encodestring(self.Document.getData().getvalue()))
# res is indexed as (status code, mapping holding a base64 'data' entry).
197 if res[0]==200:
198 file=StringIO(base64.decodestring(res[1]['data']))
199 return file
200 else:
201 return None
202
203
# NOTE(review): the `def` line (and therefore the name) of this method is not
# visible in this listing.
# Builds one {'level': int, 'value': text} dict per <text:h> element of the
# parsed content. Returns the list, or None when there are no headings.
205 """
206 Get the Summary of an odf document
207 """
208 title_elements = self.__parseContent.getElementsByTagName('text:h')
209 titles = []
210 for t in title_elements:
# NOTE(review): indexing `attributes` raises KeyError for a heading that has
# no text:outline-level attribute.
211 level = int(t.attributes['text:outline-level'].value)
# Only the heading's direct text children are read; text inside nested
# elements is not part of `title`.
212 title = self.__getNodeText(t)
213 titles.append({'level':level, 'value':title})
214 if titles:
215 return titles
216 else:
217 return None
218
219
# NOTE(review): the `def` line (and therefore the name) of this method is not
# visible in this listing.
# Returns a StringIO holding the archive member 'Thumbnails/thumbnail.png',
# or None when the opened archive has no such member.
221 """
222 Get the Thumbnails of an odf document
223 """
224 for f in self.__zipFile.infolist():
225 if f.filename == 'Thumbnails/thumbnail.png':
226 contents = self.__zipFile.read('Thumbnails/thumbnail.png')
227 return StringIO(contents)
228
229 return None
230
# NOTE(review): the `def` line (and therefore the name) of this method is not
# visible in this listing.
# Opens self.Document.getData() as a zip archive (stored in self.__zipFile)
# and stores the parsed 'content.xml' in self.__parseContent.
# NOTE(review): the failure path returns the tuple (None, None) while the
# success path falls off the end and returns None -- inconsistent shapes.
232 """
233 Uncompress an odf file and parse the "content.xml" file.
234 """
235 try:
236 self.__zipFile = zipfile.PyZipFile(self.Document.getData(),'r')
# The bound exception `e` is not used.
237 except zipfile.BadZipfile, e:
238
239 return None, None
240
241 contents = self.__zipFile.read('content.xml')
242 self.__parseContent = parseString(contents)
243
244
# NOTE(review): the `def` line of this method is not visible in this listing;
# the body appears to be the target of `self.__getTableDocumentList()` -- confirm.
# For each <table:table> element of the parsed content this copies the table,
# the <style:style> elements it references and the images it embeds into a new
# document built from the blank template, and wraps the result in a Grain.
# Returns the list of Grain objects, or None when no table was collected.
246 """
247 Extract the tables from a document and return a list of Grain instances
248 """
249 table_list=[]
250
251 template_str=self.__createNewOOoDocument()
252 tables= self.__parseContent.getElementsByTagName('table:table')
253 stylesDoc= self.__parseContent.getElementsByTagName('style:style')
254 for t in tables:
# Style names used by the table and its descendants.
# NOTE(review): the visible body of __getAttributesR returns None when it
# finds no style, which would make the `in styles` test below raise TypeError.
255 styles = self.__getAttributesR(t)
256 table_name = t.getAttribute('table:name')
# Archive paths of the images embedded in this table.
257 imgHrefs=[]
258 for img in t.getElementsByTagName("draw:image"):
259 if img.hasAttribute("xlink:href"):
260 path=img.getAttribute('xlink:href')
261
262 if "ObjectReplacements" in path:
263
# Drop the "./" prefix so the path can be used as an archive member name.
264 imgHrefs.append(path.replace("./",""))
265
# NOTE(review): `[\.jpg|\.png|\.gif]` is a character class, so this matches any
# http:// URL whose last character is one of ". j p g n | i f", not URLs
# ending in those three extensions. Matching links are skipped.
266 elif re.match("^http://.+[\.jpg|\.png|\.gif]$",path):
267 continue
268 else:
269 imgHrefs.append(path)
270
271
272 objGran = Grain()
# The caption is assembled from the text of the nodes immediately before and
# after the table.
273 leg=[]
274 p = t.previousSibling
275 n = t.nextSibling
276 if p is not None:
277 if p.hasChildNodes():
278 legenda = ''
# A fresh list is passed explicitly on each call.
279 for i in self.__getTextChildNodesTable(p,text=[]):
280 legenda+=i
281 leg.append(legenda)
282 else:
283 leg.append(self.__getNodeText(p))
284 if n is not None:
285 if n.hasChildNodes():
286 legenda = ''
287 for j in self.__getTextChildNodesTable(n,text=[]):
288 legenda+=j
289 leg.append(legenda)
290 else:
291 leg.append(self.__getNodeText(n))
292
293
294 caption = ' '.join([ i for i in leg if i is not None])
295
296 objGran.setCaption(caption)
297
# NOTE(review): table_name was already read from the same attribute above.
298 table_name = t.getAttribute('table:name')
# Load the blank template into an in-memory buffer and open it in append mode
# so that new members can be written to it.
299 new_table = StringIO()
300 new_table.write(template_str)
301 template_odt = zipfile.PyZipFile(new_table,'a')
302 doc = parseString(template_odt.read('content.xml'))
303 office_text=doc.getElementsByTagName('office:text')
304 office_text=office_text[0]
305
306
# Deep-copy the table into the template's document before attaching it.
307 newTableNo=doc.importNode(t,True)
308 office_text.appendChild(newTableNo)
309
# Copy every style of the source document that the table refers to.
310 for sty in stylesDoc:
311 if (sty.getAttribute('style:name') in styles):
312 office_automatic_styles=doc.getElementsByTagName('office:automatic-styles')
313 office_automatic_styles=office_automatic_styles[0]
314 office_automatic_styles.appendChild(doc.importNode(sty,True))
315 if imgHrefs:
316 for image in imgHrefs:
317 template_odt.writestr(str(image),self.__zipFile.read(image))
# NOTE(review): the template was already read from as containing
# 'content.xml'; writing the same name in append mode adds a second member
# with that name -- confirm readers pick the new one.
318 template_odt.writestr('content.xml',doc.toxml().encode('utf-8'))
319 template_odt.close()
320 if table_name:
321
322 objGran.setId(table_name)
323 objGran.setContent(new_table)
324 table_list.append(objGran)
325 if table_list:
326 return table_list
327 else:
328 return None
329
330
331
# NOTE(review): the `def` line of this method is not visible in this listing;
# the body appears to be the target of `self.__getImageDocumentList()` -- confirm.
# Scans every <draw:image> element of the parsed content and builds one Grain
# per distinct .png/.gif/.jpg image name, with the image bytes read from the
# archive and a caption taken from the text that follows the image's parent
# node. Returns the list of Grain objects, or None when it is empty.
333 """
334 Extract the images from a document and return a list of Grain instances
335 """
336 image_list=[]
337
338 tag_images = self.__parseContent.getElementsByTagName('draw:image')
339
340 if len(tag_images):
341 for item in tag_images:
342 name=None
343 if item.hasAttribute("xlink:href"):
344 path=item.getAttribute('xlink:href')
345
# The image name is the href with its folder prefix removed.
346 if "Pictures" in path:
347
348 name=path.replace("Pictures/","")
349 elif "ObjectReplacements" in path:
350 name=path.replace("./ObjectReplacements/","")
351
# Drop the "./" prefix so the path can be used as an archive member name.
352 path = path.replace("./","")
353
354
# NOTE(review): `[\.jpg|\.png|\.gif]` is a character class, so this matches any
# http:// URL whose last character is one of ". j p g n | i f", not URLs
# ending in those three extensions.
355 elif re.match("^http://.+[\.jpg|\.png|\.gif]$",path):
356 continue
357
358 if name is not None:
359
360 f, e = os.path.splitext(name)
361 if e.lower() in ['.png','.gif','.jpg']:
362
# Skip images whose name was already collected.
363 if not name in [image.getId() for image in image_list]:
364 parent = item.parentNode
365 nChild = parent.nextSibling
366 objGran = Grain()
367 if nChild:
368 text=[]
369 caption = ''
370 if nChild.nodeType is nChild.TEXT_NODE:
371 text.append(nChild.data)
# Add the text of the siblings that follow nChild.
372 for t in self.__getTextChildNodesImage(nChild,text):
373 if t is not None: caption+=t
374 objGran.setCaption(caption)
375 imagefile = StringIO(self.__zipFile.read(path))
376 objGran.setId(name)
377 objGran.setContent(imagefile)
378 image_list.append(objGran)
379 if image_list:
380 return image_list
381 else:
382 return None
383
384
385
386
395
404
# NOTE(review): the `def` line (and therefore the name) of this method is not
# visible in this listing.
# Guarded wrapper: returns None while self.__zipFile is None, otherwise
# delegates to __getImageDocumentList().
406 """
407 Invoke the private method __getImageDocumentList in order to retrieve the document's images
408 """
409 if self.__zipFile is not None:
410 return self.__getImageDocumentList()
411 else:
412 return None
413
# NOTE(review): the `def` line (and therefore the name) of this method is not
# visible in this listing.
# Guarded wrapper: returns None while self.__zipFile is None, otherwise
# delegates to __getTableDocumentList().
415 """
416 Invoke the private method __getTableDocumentList in order to retrieve the document's tables
417 """
418 if self.__zipFile is not None:
419 return self.__getTableDocumentList()
420 else:
421 return None
422
# NOTE(review): the `def` line (and therefore the name) of this method is not
# visible in this listing.
# Returns {'image_list': ..., 'table_list': ...} built from the two private
# extractors, or None while self.__zipFile is None. Each entry is itself None
# when the matching extractor found nothing.
424 """
425 Extract the grains from a document, returning a dictionary with a list of tables and a list of images
426 """
427 returnfiles = {}
428 if self.__zipFile is not None:
429 returnfiles['image_list'] = self.__getImageDocumentList()
430 returnfiles['table_list'] = self.__getTableDocumentList()
431 return returnfiles
432
433 else:
434 return None
435