Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/io/excel/_odfreader.py : 18%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from typing import List
3from pandas._typing import FilePathOrBuffer, Scalar
4from pandas.compat._optional import import_optional_dependency
6import pandas as pd
8from pandas.io.excel._base import _BaseExcelReader
11class _ODFReader(_BaseExcelReader):
12 """
13 Read tables out of OpenDocument formatted files.
15 Parameters
16 ----------
17 filepath_or_buffer: string, path to be parsed or
18 an open readable stream.
19 """
21 def __init__(self, filepath_or_buffer: FilePathOrBuffer):
22 import_optional_dependency("odf")
23 super().__init__(filepath_or_buffer)
25 @property
26 def _workbook_class(self):
27 from odf.opendocument import OpenDocument
29 return OpenDocument
31 def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
32 from odf.opendocument import load
34 return load(filepath_or_buffer)
36 @property
37 def empty_value(self) -> str:
38 """Property for compat with other readers."""
39 return ""
41 @property
42 def sheet_names(self) -> List[str]:
43 """Return a list of sheet names present in the document"""
44 from odf.table import Table
46 tables = self.book.getElementsByType(Table)
47 return [t.getAttribute("name") for t in tables]
49 def get_sheet_by_index(self, index: int):
50 from odf.table import Table
52 tables = self.book.getElementsByType(Table)
53 return tables[index]
55 def get_sheet_by_name(self, name: str):
56 from odf.table import Table
58 tables = self.book.getElementsByType(Table)
60 for table in tables:
61 if table.getAttribute("name") == name:
62 return table
64 raise ValueError(f"sheet {name} not found")
66 def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
67 """Parse an ODF Table into a list of lists
68 """
69 from odf.table import CoveredTableCell, TableCell, TableRow
71 covered_cell_name = CoveredTableCell().qname
72 table_cell_name = TableCell().qname
73 cell_names = {covered_cell_name, table_cell_name}
75 sheet_rows = sheet.getElementsByType(TableRow)
76 empty_rows = 0
77 max_row_len = 0
79 table: List[List[Scalar]] = []
81 for i, sheet_row in enumerate(sheet_rows):
82 sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
83 empty_cells = 0
84 table_row: List[Scalar] = []
86 for j, sheet_cell in enumerate(sheet_cells):
87 if sheet_cell.qname == table_cell_name:
88 value = self._get_cell_value(sheet_cell, convert_float)
89 else:
90 value = self.empty_value
92 column_repeat = self._get_column_repeat(sheet_cell)
94 # Queue up empty values, writing only if content succeeds them
95 if value == self.empty_value:
96 empty_cells += column_repeat
97 else:
98 table_row.extend([self.empty_value] * empty_cells)
99 empty_cells = 0
100 table_row.extend([value] * column_repeat)
102 if max_row_len < len(table_row):
103 max_row_len = len(table_row)
105 row_repeat = self._get_row_repeat(sheet_row)
106 if self._is_empty_row(sheet_row):
107 empty_rows += row_repeat
108 else:
109 # add blank rows to our table
110 table.extend([[self.empty_value]] * empty_rows)
111 empty_rows = 0
112 for _ in range(row_repeat):
113 table.append(table_row)
115 # Make our table square
116 for row in table:
117 if len(row) < max_row_len:
118 row.extend([self.empty_value] * (max_row_len - len(row)))
120 return table
122 def _get_row_repeat(self, row) -> int:
123 """Return number of times this row was repeated
124 Repeating an empty row appeared to be a common way
125 of representing sparse rows in the table.
126 """
127 from odf.namespaces import TABLENS
129 return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
131 def _get_column_repeat(self, cell) -> int:
132 from odf.namespaces import TABLENS
134 return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
136 def _is_empty_row(self, row) -> bool:
137 """Helper function to find empty rows
138 """
139 for column in row.childNodes:
140 if len(column.childNodes) > 0:
141 return False
143 return True
145 def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
146 from odf.namespaces import OFFICENS
148 cell_type = cell.attributes.get((OFFICENS, "value-type"))
149 if cell_type == "boolean":
150 if str(cell) == "TRUE":
151 return True
152 return False
153 if cell_type is None:
154 return self.empty_value
155 elif cell_type == "float":
156 # GH5394
157 cell_value = float(cell.attributes.get((OFFICENS, "value")))
159 if cell_value == 0.0: # NA handling
160 return str(cell)
162 if convert_float:
163 val = int(cell_value)
164 if val == cell_value:
165 return val
166 return cell_value
167 elif cell_type == "percentage":
168 cell_value = cell.attributes.get((OFFICENS, "value"))
169 return float(cell_value)
170 elif cell_type == "string":
171 return str(cell)
172 elif cell_type == "currency":
173 cell_value = cell.attributes.get((OFFICENS, "value"))
174 return float(cell_value)
175 elif cell_type == "date":
176 cell_value = cell.attributes.get((OFFICENS, "date-value"))
177 return pd.to_datetime(cell_value)
178 elif cell_type == "time":
179 return pd.to_datetime(str(cell)).time()
180 else:
181 raise ValueError(f"Unrecognized type {cell_type}")