Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# Copyright (c) 2010-2020 openpyxl 

2 

3"""Reader for a single worksheet.""" 

4import traceback 

5from warnings import warn 

6 

7# compatibility imports 

8from openpyxl.xml.functions import iterparse 

9 

10# package imports 

11from openpyxl.cell import Cell, MergedCell 

12from openpyxl.cell.text import Text 

13from openpyxl.worksheet.dimensions import ( 

14 ColumnDimension, 

15 RowDimension, 

16 SheetFormatProperties, 

17) 

18 

19from openpyxl.xml.constants import ( 

20 SHEET_MAIN_NS, 

21 EXT_TYPES, 

22) 

23from openpyxl.formatting.formatting import ConditionalFormatting 

24from openpyxl.formula.translate import Translator 

25from openpyxl.utils import ( 

26 get_column_letter, 

27 coordinate_to_tuple, 

28 ) 

29from openpyxl.utils.datetime import from_excel, from_ISO8601, WINDOWS_EPOCH 

30from openpyxl.descriptors.excel import ExtensionList 

31 

32from .filters import AutoFilter 

33from .header_footer import HeaderFooter 

34from .hyperlink import HyperlinkList 

35from .merge import MergeCells 

36from .page import PageMargins, PrintOptions, PrintPageSetup 

37from .pagebreak import RowBreak, ColBreak 

38from .protection import SheetProtection 

39from .scenario import ScenarioList 

40from .views import SheetViewList 

41from .datavalidation import DataValidationList 

42from .table import TablePartList 

43from .properties import WorksheetProperties 

44from .dimensions import SheetDimension 

45from .related import Related 

46 

47 

48CELL_TAG = '{%s}c' % SHEET_MAIN_NS 

49VALUE_TAG = '{%s}v' % SHEET_MAIN_NS 

50FORMULA_TAG = '{%s}f' % SHEET_MAIN_NS 

51MERGE_TAG = '{%s}mergeCells' % SHEET_MAIN_NS 

52INLINE_STRING = "{%s}is" % SHEET_MAIN_NS 

53COL_TAG = '{%s}col' % SHEET_MAIN_NS 

54ROW_TAG = '{%s}row' % SHEET_MAIN_NS 

55CF_TAG = '{%s}conditionalFormatting' % SHEET_MAIN_NS 

56LEGACY_TAG = '{%s}legacyDrawing' % SHEET_MAIN_NS 

57PROT_TAG = '{%s}sheetProtection' % SHEET_MAIN_NS 

58EXT_TAG = "{%s}extLst" % SHEET_MAIN_NS 

59HYPERLINK_TAG = "{%s}hyperlinks" % SHEET_MAIN_NS 

60TABLE_TAG = "{%s}tableParts" % SHEET_MAIN_NS 

61PRINT_TAG = '{%s}printOptions' % SHEET_MAIN_NS 

62MARGINS_TAG = '{%s}pageMargins' % SHEET_MAIN_NS 

63PAGE_TAG = '{%s}pageSetup' % SHEET_MAIN_NS 

64HEADER_TAG = '{%s}headerFooter' % SHEET_MAIN_NS 

65FILTER_TAG = '{%s}autoFilter' % SHEET_MAIN_NS 

66VALIDATION_TAG = '{%s}dataValidations' % SHEET_MAIN_NS 

67PROPERTIES_TAG = '{%s}sheetPr' % SHEET_MAIN_NS 

68VIEWS_TAG = '{%s}sheetViews' % SHEET_MAIN_NS 

69FORMAT_TAG = '{%s}sheetFormatPr' % SHEET_MAIN_NS 

70ROW_BREAK_TAG = '{%s}rowBreaks' % SHEET_MAIN_NS 

71COL_BREAK_TAG = '{%s}colBreaks' % SHEET_MAIN_NS 

72SCENARIOS_TAG = '{%s}scenarios' % SHEET_MAIN_NS 

73DATA_TAG = '{%s}sheetData' % SHEET_MAIN_NS 

74DIMENSION_TAG = '{%s}dimension' % SHEET_MAIN_NS 

75CUSTOM_VIEWS_TAG = '{%s}customSheetViews' % SHEET_MAIN_NS 

76 

77 

78def _cast_number(value): 

79 "Convert numbers as string to an int or float" 

80 if "." in value or "E" in value or "e" in value: 

81 return float(value) 

82 return int(value) 

83 

84 

85class WorkSheetParser(object): 

86 

87 def __init__(self, src, shared_strings, data_only=False, 

88 epoch=WINDOWS_EPOCH, date_formats=set()): 

89 self.min_row = self.min_col = None 

90 self.epoch = epoch 

91 self.source = src 

92 self.shared_strings = shared_strings 

93 self.data_only = data_only 

94 self.shared_formulae = {} 

95 self.array_formulae = {} 

96 self.row_counter = self.col_counter = 0 

97 self.tables = TablePartList() 

98 self.date_formats = date_formats 

99 self.row_dimensions = {} 

100 self.column_dimensions = {} 

101 self.number_formats = [] 

102 self.keep_vba = False 

103 self.hyperlinks = HyperlinkList() 

104 self.formatting = [] 

105 self.legacy_drawing = None 

106 self.merged_cells = None 

107 self.row_breaks = RowBreak() 

108 self.col_breaks = ColBreak() 

109 

110 

111 def parse(self): 

112 dispatcher = { 

113 COL_TAG: self.parse_column_dimensions, 

114 PROT_TAG: self.parse_sheet_protection, 

115 EXT_TAG: self.parse_extensions, 

116 CF_TAG: self.parse_formatting, 

117 LEGACY_TAG: self.parse_legacy, 

118 ROW_BREAK_TAG: self.parse_row_breaks, 

119 COL_BREAK_TAG: self.parse_col_breaks, 

120 CUSTOM_VIEWS_TAG: self.parse_custom_views, 

121 } 

122 

123 properties = { 

124 PRINT_TAG: ('print_options', PrintOptions), 

125 MARGINS_TAG: ('page_margins', PageMargins), 

126 PAGE_TAG: ('page_setup', PrintPageSetup), 

127 HEADER_TAG: ('HeaderFooter', HeaderFooter), 

128 FILTER_TAG: ('auto_filter', AutoFilter), 

129 VALIDATION_TAG: ('data_validations', DataValidationList), 

130 PROPERTIES_TAG: ('sheet_properties', WorksheetProperties), 

131 VIEWS_TAG: ('views', SheetViewList), 

132 FORMAT_TAG: ('sheet_format', SheetFormatProperties), 

133 SCENARIOS_TAG: ('scenarios', ScenarioList), 

134 TABLE_TAG: ('tables', TablePartList), 

135 HYPERLINK_TAG: ('hyperlinks', HyperlinkList), 

136 MERGE_TAG: ('merged_cells', MergeCells), 

137 

138 } 

139 

140 it = iterparse(self.source) 

141 

142 for _, element in it: 

143 tag_name = element.tag 

144 if tag_name in dispatcher: 

145 dispatcher[tag_name](element) 

146 element.clear() 

147 elif tag_name in properties: 

148 prop = properties[tag_name] 

149 obj = prop[1].from_tree(element) 

150 setattr(self, prop[0], obj) 

151 element.clear() 

152 elif tag_name == ROW_TAG: 

153 row = self.parse_row(element) 

154 element.clear() 

155 yield row 

156 

157 

158 def parse_dimensions(self): 

159 """ 

160 Get worksheet dimensions if they are provided. 

161 """ 

162 it = iterparse(self.source) 

163 

164 for _event, element in it: 

165 if element.tag == DIMENSION_TAG: 

166 dim = SheetDimension.from_tree(element) 

167 return dim.boundaries 

168 

169 elif element.tag == DATA_TAG: 

170 # Dimensions missing 

171 break 

172 element.clear() 

173 

174 

175 def parse_cell(self, element): 

176 data_type = element.get('t', 'n') 

177 coordinate = element.get('r') 

178 self.col_counter += 1 

179 style_id = element.get('s', 0) 

180 if style_id: 

181 style_id = int(style_id) 

182 

183 if data_type == "inlineStr": 

184 value = None 

185 else: 

186 value = element.findtext(VALUE_TAG, None) or None 

187 

188 if coordinate: 

189 row, column = coordinate_to_tuple(coordinate) 

190 else: 

191 row, column = self.row_counter, self.col_counter 

192 

193 if not self.data_only and element.find(FORMULA_TAG) is not None: 

194 data_type = 'f' 

195 value = self.parse_formula(element) 

196 

197 elif value is not None: 

198 if data_type == 'n': 

199 value = _cast_number(value) 

200 if style_id in self.date_formats: 

201 data_type = 'd' 

202 try: 

203 value = from_excel(value, self.epoch) 

204 except ValueError: 

205 msg = """Cell {0} is marked as a date but the serial value {1} is outside the limits for dates. The cell will be treated as an error.""".format(coordinate, value) 

206 warn(msg) 

207 data_type = "e" 

208 value = "#VALUE!" 

209 elif data_type == 's': 

210 value = self.shared_strings[int(value)] 

211 elif data_type == 'b': 

212 value = bool(int(value)) 

213 elif data_type == "str": 

214 data_type = "s" 

215 elif data_type == 'd': 

216 value = from_ISO8601(value) 

217 

218 elif data_type == 'inlineStr': 

219 child = element.find(INLINE_STRING) 

220 if child is not None: 

221 data_type = 's' 

222 richtext = Text.from_tree(child) 

223 value = richtext.content 

224 

225 return {'row':row, 'column':column, 'value':value, 'data_type':data_type, 'style_id':style_id} 

226 

227 

228 def parse_formula(self, element): 

229 """ 

230 possible formulae types: shared, array, datatable 

231 """ 

232 formula = element.find(FORMULA_TAG) 

233 formula_type = formula.get('t') 

234 coordinate = element.get('r') 

235 value = "=" 

236 if formula.text is not None: 

237 value += formula.text 

238 

239 if formula_type == "array": 

240 self.array_formulae[coordinate] = dict(formula.attrib) 

241 

242 elif formula_type == "shared": 

243 idx = formula.get('si') 

244 if idx in self.shared_formulae: 

245 trans = self.shared_formulae[idx] 

246 value = trans.translate_formula(coordinate) 

247 elif value != "=": 

248 self.shared_formulae[idx] = Translator(value, coordinate) 

249 

250 return value 

251 

252 

253 def parse_column_dimensions(self, col): 

254 attrs = dict(col.attrib) 

255 column = get_column_letter(int(attrs['min'])) 

256 attrs['index'] = column 

257 self.column_dimensions[column] = attrs 

258 

259 

260 def parse_row(self, row): 

261 attrs = dict(row.attrib) 

262 

263 if "r" in attrs: 

264 self.row_counter = int(attrs['r']) 

265 else: 

266 self.row_counter += 1 

267 self.col_counter = 0 

268 

269 keys = {k for k in attrs if not k.startswith('{')} 

270 if keys - {'r', 'spans'}: 

271 # don't create dimension objects unless they have relevant information 

272 self.row_dimensions[str(self.row_counter)] = attrs 

273 

274 cells = [self.parse_cell(el) for el in row] 

275 return self.row_counter, cells 

276 

277 

278 def parse_formatting(self, element): 

279 try: 

280 cf = ConditionalFormatting.from_tree(element) 

281 self.formatting.append(cf) 

282 except TypeError: 

283 msg = f"Failed to load a conditional formatting rule. It will be discarded. Cause: {traceback.format_exc()}" 

284 warn(msg) 

285 

286 

287 def parse_sheet_protection(self, element): 

288 protection = SheetProtection.from_tree(element) 

289 password = element.get("password") 

290 if password is not None: 

291 protection.set_password(password, True) 

292 self.protection = protection 

293 

294 

295 def parse_extensions(self, element): 

296 extLst = ExtensionList.from_tree(element) 

297 for e in extLst.ext: 

298 ext_type = EXT_TYPES.get(e.uri.upper(), "Unknown") 

299 msg = "{0} extension is not supported and will be removed".format(ext_type) 

300 warn(msg) 

301 

302 

303 def parse_legacy(self, element): 

304 obj = Related.from_tree(element) 

305 self.legacy_drawing = obj.id 

306 

307 

308 def parse_row_breaks(self, element): 

309 brk = RowBreak.from_tree(element) 

310 self.row_breaks = brk 

311 

312 

313 def parse_col_breaks(self, element): 

314 brk = ColBreak.from_tree(element) 

315 self.col_breaks = brk 

316 

317 

318 def parse_custom_views(self, element): 

319 # clear page_breaks to avoid duplication which Excel doesn't like 

320 # basically they're ignored in custom views 

321 self.row_breaks = RowBreak() 

322 self.col_breaks = ColBreak() 

323 

324 

325class WorksheetReader(object): 

326 """ 

327 Create a parser and apply it to a workbook 

328 """ 

329 

330 def __init__(self, ws, xml_source, shared_strings, data_only): 

331 self.ws = ws 

332 self.parser = WorkSheetParser(xml_source, shared_strings, data_only, ws.parent.epoch, ws.parent._date_formats) 

333 self.tables = [] 

334 

335 

336 def bind_cells(self): 

337 for idx, row in self.parser.parse(): 

338 for cell in row: 

339 style = self.ws.parent._cell_styles[cell['style_id']] 

340 c = Cell(self.ws, row=cell['row'], column=cell['column'], style_array=style) 

341 c._value = cell['value'] 

342 c.data_type = cell['data_type'] 

343 self.ws._cells[(cell['row'], cell['column'])] = c 

344 self.ws.formula_attributes = self.parser.array_formulae 

345 if self.ws._cells: 

346 self.ws._current_row = self.ws.max_row # use cells not row dimensions 

347 

348 

349 def bind_formatting(self): 

350 for cf in self.parser.formatting: 

351 for rule in cf.rules: 

352 if rule.dxfId is not None: 

353 rule.dxf = self.ws.parent._differential_styles[rule.dxfId] 

354 self.ws.conditional_formatting[cf] = rule 

355 

356 

357 def bind_tables(self): 

358 for t in self.parser.tables.tablePart: 

359 rel = self.ws._rels[t.id] 

360 self.tables.append(rel.Target) 

361 

362 

363 def bind_merged_cells(self): 

364 from openpyxl.worksheet.cell_range import MultiCellRange 

365 from openpyxl.worksheet.merge import MergedCellRange 

366 if not self.parser.merged_cells: 

367 return 

368 

369 ranges = [] 

370 for cr in self.parser.merged_cells.mergeCell: 

371 mcr = MergedCellRange(self.ws, cr.ref) 

372 self.ws._clean_merge_range(mcr) 

373 ranges.append(mcr) 

374 self.ws.merged_cells = MultiCellRange(ranges) 

375 

376 

377 def bind_hyperlinks(self): 

378 for link in self.parser.hyperlinks.hyperlink: 

379 if link.id: 

380 rel = self.ws._rels[link.id] 

381 link.target = rel.Target 

382 if ":" in link.ref: 

383 # range of cells 

384 for row in self.ws[link.ref]: 

385 for cell in row: 

386 try: 

387 cell.hyperlink = link 

388 except AttributeError: 

389 pass 

390 else: 

391 cell = self.ws[link.ref] 

392 if isinstance(cell, MergedCell): 

393 cell = self.normalize_merged_cell_link(cell.coordinate) 

394 cell.hyperlink = link 

395 

396 def normalize_merged_cell_link(self, coord): 

397 """ 

398 Returns the appropriate cell to which a hyperlink, which references a merged cell at the specified coordinates, 

399 should be bound. 

400 """ 

401 for rng in self.ws.merged_cells: 

402 if coord in rng: 

403 return self.ws.cell(*rng.top[0]) 

404 

405 def bind_col_dimensions(self): 

406 for col, cd in self.parser.column_dimensions.items(): 

407 if 'style' in cd: 

408 key = int(cd['style']) 

409 cd['style'] = self.ws.parent._cell_styles[key] 

410 self.ws.column_dimensions[col] = ColumnDimension(self.ws, **cd) 

411 

412 

413 def bind_row_dimensions(self): 

414 for row, rd in self.parser.row_dimensions.items(): 

415 if 's' in rd: 

416 key = int(rd['s']) 

417 rd['s'] = self.ws.parent._cell_styles[key] 

418 self.ws.row_dimensions[int(row)] = RowDimension(self.ws, **rd) 

419 

420 

421 def bind_properties(self): 

422 for k in ('print_options', 'page_margins', 'page_setup', 

423 'HeaderFooter', 'auto_filter', 'data_validations', 

424 'sheet_properties', 'views', 'sheet_format', 

425 'row_breaks', 'col_breaks', 'scenarios', 'legacy_drawing', 

426 'protection', 

427 ): 

428 v = getattr(self.parser, k, None) 

429 if v is not None: 

430 setattr(self.ws, k, v) 

431 

432 

433 def bind_all(self): 

434 self.bind_cells() 

435 self.bind_merged_cells() 

436 self.bind_hyperlinks() 

437 self.bind_formatting() 

438 self.bind_col_dimensions() 

439 self.bind_row_dimensions() 

440 self.bind_tables() 

441 self.bind_properties()