Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from typing import List 

2 

3from pandas._typing import FilePathOrBuffer, Scalar 

4from pandas.compat._optional import import_optional_dependency 

5 

6import pandas as pd 

7 

8from pandas.io.excel._base import _BaseExcelReader 

9 

10 

11class _ODFReader(_BaseExcelReader): 

12 """ 

13 Read tables out of OpenDocument formatted files. 

14 

15 Parameters 

16 ---------- 

17 filepath_or_buffer: string, path to be parsed or 

18 an open readable stream. 

19 """ 

20 

21 def __init__(self, filepath_or_buffer: FilePathOrBuffer): 

22 import_optional_dependency("odf") 

23 super().__init__(filepath_or_buffer) 

24 

25 @property 

26 def _workbook_class(self): 

27 from odf.opendocument import OpenDocument 

28 

29 return OpenDocument 

30 

31 def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): 

32 from odf.opendocument import load 

33 

34 return load(filepath_or_buffer) 

35 

36 @property 

37 def empty_value(self) -> str: 

38 """Property for compat with other readers.""" 

39 return "" 

40 

41 @property 

42 def sheet_names(self) -> List[str]: 

43 """Return a list of sheet names present in the document""" 

44 from odf.table import Table 

45 

46 tables = self.book.getElementsByType(Table) 

47 return [t.getAttribute("name") for t in tables] 

48 

49 def get_sheet_by_index(self, index: int): 

50 from odf.table import Table 

51 

52 tables = self.book.getElementsByType(Table) 

53 return tables[index] 

54 

55 def get_sheet_by_name(self, name: str): 

56 from odf.table import Table 

57 

58 tables = self.book.getElementsByType(Table) 

59 

60 for table in tables: 

61 if table.getAttribute("name") == name: 

62 return table 

63 

64 raise ValueError(f"sheet {name} not found") 

65 

66 def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: 

67 """Parse an ODF Table into a list of lists 

68 """ 

69 from odf.table import CoveredTableCell, TableCell, TableRow 

70 

71 covered_cell_name = CoveredTableCell().qname 

72 table_cell_name = TableCell().qname 

73 cell_names = {covered_cell_name, table_cell_name} 

74 

75 sheet_rows = sheet.getElementsByType(TableRow) 

76 empty_rows = 0 

77 max_row_len = 0 

78 

79 table: List[List[Scalar]] = [] 

80 

81 for i, sheet_row in enumerate(sheet_rows): 

82 sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] 

83 empty_cells = 0 

84 table_row: List[Scalar] = [] 

85 

86 for j, sheet_cell in enumerate(sheet_cells): 

87 if sheet_cell.qname == table_cell_name: 

88 value = self._get_cell_value(sheet_cell, convert_float) 

89 else: 

90 value = self.empty_value 

91 

92 column_repeat = self._get_column_repeat(sheet_cell) 

93 

94 # Queue up empty values, writing only if content succeeds them 

95 if value == self.empty_value: 

96 empty_cells += column_repeat 

97 else: 

98 table_row.extend([self.empty_value] * empty_cells) 

99 empty_cells = 0 

100 table_row.extend([value] * column_repeat) 

101 

102 if max_row_len < len(table_row): 

103 max_row_len = len(table_row) 

104 

105 row_repeat = self._get_row_repeat(sheet_row) 

106 if self._is_empty_row(sheet_row): 

107 empty_rows += row_repeat 

108 else: 

109 # add blank rows to our table 

110 table.extend([[self.empty_value]] * empty_rows) 

111 empty_rows = 0 

112 for _ in range(row_repeat): 

113 table.append(table_row) 

114 

115 # Make our table square 

116 for row in table: 

117 if len(row) < max_row_len: 

118 row.extend([self.empty_value] * (max_row_len - len(row))) 

119 

120 return table 

121 

122 def _get_row_repeat(self, row) -> int: 

123 """Return number of times this row was repeated 

124 Repeating an empty row appeared to be a common way 

125 of representing sparse rows in the table. 

126 """ 

127 from odf.namespaces import TABLENS 

128 

129 return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1)) 

130 

131 def _get_column_repeat(self, cell) -> int: 

132 from odf.namespaces import TABLENS 

133 

134 return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1)) 

135 

136 def _is_empty_row(self, row) -> bool: 

137 """Helper function to find empty rows 

138 """ 

139 for column in row.childNodes: 

140 if len(column.childNodes) > 0: 

141 return False 

142 

143 return True 

144 

145 def _get_cell_value(self, cell, convert_float: bool) -> Scalar: 

146 from odf.namespaces import OFFICENS 

147 

148 cell_type = cell.attributes.get((OFFICENS, "value-type")) 

149 if cell_type == "boolean": 

150 if str(cell) == "TRUE": 

151 return True 

152 return False 

153 if cell_type is None: 

154 return self.empty_value 

155 elif cell_type == "float": 

156 # GH5394 

157 cell_value = float(cell.attributes.get((OFFICENS, "value"))) 

158 

159 if cell_value == 0.0: # NA handling 

160 return str(cell) 

161 

162 if convert_float: 

163 val = int(cell_value) 

164 if val == cell_value: 

165 return val 

166 return cell_value 

167 elif cell_type == "percentage": 

168 cell_value = cell.attributes.get((OFFICENS, "value")) 

169 return float(cell_value) 

170 elif cell_type == "string": 

171 return str(cell) 

172 elif cell_type == "currency": 

173 cell_value = cell.attributes.get((OFFICENS, "value")) 

174 return float(cell_value) 

175 elif cell_type == "date": 

176 cell_value = cell.attributes.get((OFFICENS, "date-value")) 

177 return pd.to_datetime(cell_value) 

178 elif cell_type == "time": 

179 return pd.to_datetime(str(cell)).time() 

180 else: 

181 raise ValueError(f"Unrecognized type {cell_type}")