Source code for hemlock.clients.file_types.hpdf

#!/usr/bin/env python
#
#   Copyright (c) 2013 In-Q-Tel, Inc/Lab41, All Rights Reserved.
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

from .. import hfs

from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO

import base64
import json

class Hpdf:
    """Handler that turns a PDF file into JSON payloads and hands them to HFs."""

    def process_files(self, debug, file, file_mime, h_server, client_uuid):
        """Build a JSON payload for a PDF file and pass it to HFs.format_lists.

        The payload is the text extracted by convert_pdf; if extraction (or
        JSON encoding of the extracted text) fails, the raw file bytes are
        sent base64-encoded instead.

        Args:
            debug: passed through unchanged to convert_pdf and format_lists.
            file: path of the file to process; opened in binary mode.
            file_mime: mime string; the file is handled only if it
                contains "pdf".
            h_server: passed through unchanged to format_lists.
            client_uuid: passed through unchanged to format_lists.

        Raises:
            IOError: if ``file`` cannot be opened.
        """
        # The context manager guarantees the handle is released even when
        # extraction or the hand-off to HFs raises.
        with open(file, 'rb') as f:
            h_inst = hfs.HFs()
            # Must exist before the append below; previously it was only
            # bound after use, which raised UnboundLocalError.
            j_list = []

            # NOTE(review): the original layout was flattened, so the nesting
            # here is reconstructed -- the append and the format_lists call
            # are kept inside the "pdf" branch because j_str only exists
            # there. Confirm against the upstream file.
            if "pdf" in file_mime:
                try:
                    text = self.convert_pdf(debug, file)
                    j_str = json.dumps({"payload": text})
                except Exception:
                    # Deliberate best-effort fallback: ship the raw bytes.
                    # Decoding to ASCII keeps the value JSON-serializable
                    # whether b64encode returns str or bytes.
                    b64_text = base64.b64encode(f.read()).decode('ascii')
                    j_str = json.dumps({"payload": b64_text})
                j_list.append(j_str)
                h_inst.format_lists(debug, j_list, h_server, client_uuid)

    def convert_pdf(self, debug, input):
        """Extract the text of a PDF file using pdfminer.

        Args:
            debug: unused here; kept for signature compatibility.
            input: path of the PDF file to read.

        Returns:
            The extracted text as written by TextConverter with the
            'utf-8' codec.

        Raises:
            IOError: if ``input`` cannot be opened. Errors raised by
                pdfminer while parsing propagate to the caller.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        try:
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            try:
                # open() replaces the Python 2-only file() constructor.
                with open(input, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp)
            finally:
                # Close the converter before reading the buffer, as the
                # original did, and also when parsing fails.
                device.close()
            return retstr.getvalue()
        finally:
            retstr.close()