Source code for simplesqlite.loader.spreadsheet.excelloader

# encoding: utf-8

"""
.. codeauthor:: Tsuyoshi Hombashi <gogogo.vm@gmail.com>
"""


from __future__ import absolute_import

import path
from six.moves import range
import xlrd

from ..error import InvalidDataError
from ..data import TableData
from .core import SpreadSheetLoader


class ExcelTableFileLoader(SpreadSheetLoader):
    """
    Concrete class of Microsoft Excel |TM| file loader.

    While :py:meth:`.load` iterates over a workbook, the sheet being
    processed is kept in ``self._worksheet``; the ``_sheet_name``,
    ``_row_count`` and ``_col_count`` properties all read from it.
    """

    @property
    def _sheet_name(self):
        # Name of the worksheet currently being processed.
        return self._worksheet.name

    @property
    def _row_count(self):
        # Number of rows reported by xlrd for the current worksheet.
        return self._worksheet.nrows

    @property
    def _col_count(self):
        # Number of columns reported by xlrd for the current worksheet.
        return self._worksheet.ncols

    def __init__(self, file_path=None):
        super(ExcelTableFileLoader, self).__init__(file_path)

    def make_table_name(self):
        """
        |make_table_name|

        ================  ===========================
        format specifier  value after the replacement
        ================  ===========================
        ``%(filename)s``  Filename of the workbook
        ``%(sheet)s``     Name of the sheet
        ================  ===========================

        :return: Table name.
        :rtype: str
        """

        self._validate_source()

        table_name = super(ExcelTableFileLoader, self).make_table_name()

        # Only ``%(filename)s`` is substituted here; any other specifier is
        # expected to have been handled by the parent implementation
        # (NOTE(review): confirm against SpreadSheetLoader.make_table_name).
        return table_name.replace(
            "%(filename)s", path.Path(self.source).namebase)

    def load(self):
        """
        Load table data from an Excel file.

        |load_desc|

        Sheets that cannot be converted to a table are skipped silently:
        sheets without columns, sheets with at most one row, sheets that
        contain only empty cells, and sheets in which no header row
        (a row without empty cells) is found.

        :return:
            |load_return|
            :py:meth:`~.ExcelTableFileLoader.make_table_name`.
        :rtype: iterator of |TableData|
        :raises InvalidDataError: If xlrd fails to open the workbook.
        """

        self._validate()

        try:
            workbook = xlrd.open_workbook(self.source)
        except xlrd.biffh.XLRDError as e:
            raise InvalidDataError(e)

        for worksheet in workbook.sheets():
            self._worksheet = worksheet

            if self._is_empty_sheet():
                continue

            try:
                # Both steps signal "nothing loadable in this sheet" with
                # InvalidDataError; in either case move on to the next sheet.
                self.__extract_not_empty_col_idx()
                start_row_idx = self._get_start_row_idx()
            except InvalidDataError:
                continue

            header_list = self.__get_row_values(start_row_idx)
            record_list = [
                self.__get_row_values(row_idx)
                for row_idx in range(start_row_idx + 1, self._row_count)
            ]

            yield TableData(self.make_table_name(), header_list, record_list)

    def _is_empty_sheet(self):
        """
        :return:
            |True| if the current sheet has no columns or has at most
            one row (a single row would be a header without any record).
        :rtype: bool
        """

        return self._col_count == 0 or self._row_count <= 1

    def _get_start_row_idx(self):
        """
        Find the header row, scanning downwards from ``self.start_row``.

        :return: Index of the first row that qualifies as a header row.
        :rtype: int
        :raises InvalidDataError: If no row qualifies as a header row.
        """

        for row_idx in range(self.start_row, self._row_count):
            if self.__is_header_row(row_idx):
                return row_idx

        raise InvalidDataError("header row is not found")

    def __is_header_row(self, row_idx):
        # A header row has no empty cell inside the non-empty column range
        # determined by __extract_not_empty_col_idx().
        cell_type_list = self._worksheet.row_types(
            row_idx, self._start_col_idx, self._end_col_idx + 1)

        return xlrd.XL_CELL_EMPTY not in cell_type_list

    @staticmethod
    def __is_empty_cell_type_list(cell_type_list):
        # True when every cell type is XL_CELL_EMPTY (also for an empty list).
        return all(
            cell_type == xlrd.XL_CELL_EMPTY
            for cell_type in cell_type_list
        )

    def __extract_not_empty_col_idx(self):
        """
        Store the first and the last index of the columns that contain at
        least one non-empty cell in ``self._start_col_idx`` and
        ``self._end_col_idx``.

        :raises InvalidDataError:
            If every column of the sheet consists of empty cells only.
        """

        col_idx_list = [
            col_idx
            for col_idx in range(self._col_count)
            if not self.__is_empty_cell_type_list(
                self._worksheet.col_types(col_idx))
        ]

        if not col_idx_list:
            # min()/max() would raise ValueError on an empty list; report
            # the condition with the loader's own exception type instead.
            raise InvalidDataError("non-empty column is not found")

        self._start_col_idx = min(col_idx_list)
        self._end_col_idx = max(col_idx_list)

    def __get_row_values(self, row_idx):
        # Cell values of the row, restricted to the non-empty column range.
        return self._worksheet.row_values(
            row_idx, self._start_col_idx, self._end_col_idx + 1)