Coverage for cc_modules/cc_spreadsheet.py: 29%

266 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-15 14:23 +0100

1""" 

2camcops_server/cc_modules/cc_spreadsheet.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2012, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CamCOPS. 

10 

11 CamCOPS is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CamCOPS is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CamCOPS. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26**Helper functions/classes for spreadsheet-style tab-separated value (TSV) 

27(and related) exports.** 

28 

29""" 

30 

31from collections import OrderedDict 

32import csv 

33import io 

34import logging 

35import os 

36import random 

37import re 

38from typing import ( 

39 Any, 

40 BinaryIO, 

41 Callable, 

42 Container, 

43 Dict, 

44 Iterable, 

45 List, 

46 Optional, 

47 Sequence, 

48 Union, 

49) 

50import zipfile 

51 

52from cardinal_pythonlib.datetimefunc import ( 

53 format_datetime, 

54 get_now_localtz_pendulum, 

55) 

56from cardinal_pythonlib.excel import ( 

57 convert_for_openpyxl, 

58 convert_for_pyexcel_ods3, 

59) 

60from cardinal_pythonlib.logs import BraceStyleAdapter 

61from sqlalchemy.engine import Result 

62 

63from camcops_server.cc_modules.cc_constants import DateFormat 

64 

65ODS_VIA_PYEXCEL = True # significantly faster 

66XLSX_VIA_PYEXCEL = True 

67 

68if ODS_VIA_PYEXCEL: 

69 import pyexcel_ods3 # e.g. pip install pyexcel-ods3==0.5.3 

70 

71 ODSWriter = ODSSheet = None 

72else: 

73 from odswriter import ODSWriter, Sheet as ODSSheet # type: ignore[no-redef] # noqa: E501 

74 

75 pyexcel_ods3 = None 

76 

77if XLSX_VIA_PYEXCEL: 

78 import pyexcel_xlsx # e.g. pip install pyexcel-xlsx==0.5.7 

79 

80 openpyxl = XLWorkbook = XLWorksheet = None 

81else: 

82 from openpyxl.workbook.workbook import Workbook as XLWorkbook # type: ignore[no-redef] # noqa: E501 

83 from openpyxl.worksheet.worksheet import Worksheet as XLWorksheet # type: ignore[no-redef] # noqa: E501 

84 

85 pyexcel_xlsx = None 

86 

87log = BraceStyleAdapter(logging.getLogger(__name__)) 

88 

89 

90# ============================================================================= 

91# Spreadsheet output holding structures 

92# ============================================================================= 

93 

94 

95class SpreadsheetPage(object): 

96 """ 

97 Represents a single "spreadsheet" page, e.g. for TSV/Excel/ODS output. 

98 """ 

99 

100 def __init__( 

101 self, name: str, rows: List[Union[Dict[str, Any], OrderedDict]] 

102 ) -> None: 

103 """ 

104 Args: 

105 name: name for the whole sheet 

106 rows: list of rows, where each row is a dictionary mapping 

107 column name to value 

108 """ 

109 assert name, "Missing name" 

110 self.name = name 

111 self.rows = rows 

112 self.headings = [] # type: List[str] 

113 for row in rows: 

114 self._add_headings_if_absent(row.keys()) 

115 

116 def __str__(self) -> str: 

117 return f"SpreadsheetPage: name={self.name}\n{self.get_tsv()}" 

118 

119 @classmethod 

120 def from_headings_rows( 

121 cls, name: str, headings: List[str], rows: List[Sequence[Any]] 

122 ) -> "SpreadsheetPage": 

123 """ 

124 Creates a SpreadsheetPage object using a list of headings and the row 

125 data as a list of lists. 

126 """ 

127 page = cls(name=name, rows=[]) 

128 n_cols = len(headings) 

129 page.headings = headings 

130 for row in rows: 

131 assert len(row) == n_cols 

132 page.rows.append(dict(zip(headings, row))) 

133 return page 

134 

135 @classmethod 

136 def from_result(cls, name: str, rp: Result) -> "SpreadsheetPage": 

137 """ 

138 Creates a SpreadsheetPage object from an SQLAlchemy Result. 

139 

140 Args: 

141 rp: 

142 A :class:` sqlalchemy.engine.Result`. 

143 name: 

144 Name for this sheet. 

145 """ 

146 column_names = rp.keys() 

147 rows = rp.fetchall() 

148 return cls.from_headings_rows( 

149 name=name, headings=column_names, rows=rows # type: ignore[arg-type] # noqa: E501 

150 ) 

151 

152 @property 

153 def empty(self) -> bool: 

154 """ 

155 Do we have zero rows? 

156 """ 

157 return len(self.rows) == 0 

158 

159 def _add_headings_if_absent(self, headings: Iterable[str]) -> None: 

160 """ 

161 Add any headings we've not yet seen to our list of headings. 

162 """ 

163 for h in headings: 

164 if h not in self.headings: 

165 self.headings.append(h) 

166 

167 def add_or_set_value(self, heading: str, value: Any) -> None: 

168 """ 

169 If we contain only a single row, this function will set the value 

170 for a given column (``heading``) to ``value``. 

171 

172 Raises: 

173 :exc:`AssertionError` if we don't have exactly 1 row 

174 """ 

175 assert len(self.rows) == 1, "add_value can only be used if #rows == 1" 

176 self._add_headings_if_absent([heading]) 

177 self.rows[0][heading] = value 

178 

179 def add_or_set_column(self, heading: str, values: List[Any]) -> None: 

180 """ 

181 Set the column labelled ``heading`` so it contains the values specified 

182 in ``values``. The length of ``values`` must equal the number of rows 

183 that we already contain. 

184 

185 Raises: 

186 :exc:`AssertionError` if the number of values doesn't match 

187 the number of existing rows 

188 """ 

189 assert len(values) == len(self.rows), "#values != #existing rows" 

190 self._add_headings_if_absent([heading]) 

191 for i, row in enumerate(self.rows): 

192 row[heading] = values[i] 

193 

194 def add_or_set_columns_from_page(self, other: "SpreadsheetPage") -> None: 

195 """ 

196 This function presupposes that ``self`` and ``other`` are two pages 

197 ("spreadsheets") with *matching* rows. 

198 

199 It updates values or creates columns in ``self`` such that the values 

200 from all columns in ``other`` are written to the corresponding rows of 

201 ``self``. 

202 

203 Raises: 

204 :exc:`AssertionError` if the two pages (sheets) don't have 

205 the same number of rows. 

206 """ 

207 assert len(self.rows) == len(other.rows), "Mismatched #rows" 

208 self._add_headings_if_absent(other.headings) 

209 for i, row in enumerate(self.rows): 

210 for k, v in other.rows[i].items(): 

211 row[k] = v 

212 

213 def add_rows_from_page(self, other: "SpreadsheetPage") -> None: 

214 """ 

215 Add all rows from ``other`` to ``self``. 

216 """ 

217 self._add_headings_if_absent(other.headings) 

218 self.rows.extend(other.rows) 

219 

220 def sort_headings(self) -> None: 

221 """ 

222 Sort our headings internally. 

223 """ 

224 self.headings.sort() 

225 

226 def delete_columns(self, headings: Container[str]) -> None: 

227 """ 

228 Removes columns with the specified heading names. 

229 Used to simplify spreadsheets. 

230 

231 Since our rows are a dictionary, and our export functions are based on 

232 the headings, all we have to do is to delete the unwanted headings. 

233 """ 

234 self.headings = [h for h in self.headings if h not in headings] 

235 

236 @property 

237 def plainrows(self) -> List[List[Any]]: 

238 """ 

239 Returns a list of rows, where each row is a list of values. 

240 Does not include a "header" row. 

241 

242 Compare :attr:`rows`, which is a list of dictionaries. 

243 """ 

244 rows = [] 

245 for row in self.rows: 

246 rows.append([row.get(h) for h in self.headings]) 

247 return rows 

248 

249 def spreadsheetrows( 

250 self, converter: Callable[[Any], Any] 

251 ) -> List[List[Any]]: 

252 """ 

253 Like :meth:`plainrows`, but (a) ensures every cell is converted to a 

254 value that can be sent to a spreadsheet converted (e.g. ODS, XLSX), and 

255 (b) includes a header row. 

256 """ 

257 rows = [self.headings.copy()] 

258 for row in self.rows: 

259 rows.append([converter(row.get(h)) for h in self.headings]) 

260 return rows 

261 

262 def get_tsv(self, dialect: str = "excel-tab") -> str: 

263 r""" 

264 Returns the entire page (sheet) as TSV: one header row and then 

265 lots of data rows. 

266 

267 For the dialect, see 

268 https://docs.python.org/3/library/csv.html#csv.excel_tab. 

269 

270 For CSV files, see RGC 4180: https://tools.ietf.org/html/rfc4180. 

271 

272 For TSV files, see 

273 https://www.iana.org/assignments/media-types/text/tab-separated-values. 

274 

275 Test code: 

276 

277 .. code-block:: python 

278 

279 import io 

280 import csv 

281 from typing import List 

282 

283 def test(row: List[str], dialect: str = "excel-tab") -> str: 

284 f = io.StringIO() 

285 writer = csv.writer(f, dialect=dialect) 

286 writer.writerow(row) 

287 return f.getvalue() 

288 

289 test(["hello", "world"]) 

290 test(["hello\ttab", "world"]) # actual tab within double quotes 

291 test(["hello\nnewline", "world"]) # actual newline within double quotes 

292 test(['hello"doublequote', "world"]) # doubled double quote within double quotes 

293 

294 """ # noqa 

295 f = io.StringIO() 

296 writer = csv.writer(f, dialect=dialect) 

297 writer.writerow(self.headings) 

298 for row in self.rows: 

299 writer.writerow([row.get(h) for h in self.headings]) 

300 return f.getvalue() 

301 

302 def write_to_openpyxl_xlsx_worksheet(self, ws: "XLWorksheet") -> None: # type: ignore[valid-type] # noqa: E501 

303 """ 

304 Writes data from this page to an existing ``openpyxl`` XLSX worksheet. 

305 """ 

306 ws.append(self.headings) # type: ignore[attr-defined] 

307 for row in self.rows: 

308 ws.append( # type: ignore[attr-defined] 

309 [convert_for_openpyxl(row.get(h)) for h in self.headings] 

310 ) 

311 

312 def write_to_odswriter_ods_worksheet(self, ws: "ODSSheet") -> None: # type: ignore[valid-type] # noqa: E501 

313 """ 

314 Writes data from this page to an existing ``odswriter`` ODS sheet. 

315 """ 

316 # noinspection PyUnresolvedReferences 

317 ws.writerow(self.headings) # type: ignore[attr-defined] 

318 for row in self.rows: 

319 # noinspection PyUnresolvedReferences 

320 ws.writerow([row.get(h) for h in self.headings]) # type: ignore[attr-defined] # noqa: E501 

321 

322 def r_object_name(self) -> str: 

323 """ 

324 Name of the object when imported into R. 

325 The main thing: no leading underscores. 

326 """ 

327 n = self.name 

328 n = n[1:] if n.startswith("_") else n 

329 return f"camcops_{n}" # less chance of conflict within R 

330 

331 def r_data_table_definition(self) -> str: 

332 """ 

333 Returns a string to define this object as a ``data.table`` in R. 

334 

335 See also: 

336 

337 - https://stackoverflow.com/questions/32103639/read-csv-file-in-r-with-double-quotes 

338 """ # noqa 

339 object_name = self.r_object_name() 

340 csv_text = self.get_tsv(dialect="excel") 

341 csv_text = csv_text.replace('"', r"\"") 

342 definition = ( 

343 f'data.table::fread(sep=",", header=TRUE, text="{csv_text}"\n)' 

344 ) 

345 return f"{object_name} <- {definition}" 

346 

347 

348class SpreadsheetCollection(object): 

349 """ 

350 A collection of 

351 :class:`camcops_server.cc_modules.cc_spreadsheet.SpreadsheetPage` pages 

352 (spreadsheets), like an Excel workbook. 

353 """ 

354 

355 def __init__(self) -> None: 

356 self.pages = [] # type: List[SpreadsheetPage] 

357 

358 def __str__(self) -> str: 

359 return "SpreadsheetCollection:\n" + "\n\n".join( 

360 page.get_tsv() for page in self.pages 

361 ) 

362 

363 # ------------------------------------------------------------------------- 

364 # Pages 

365 # ------------------------------------------------------------------------- 

366 

367 def page_with_name(self, page_name: str) -> Optional[SpreadsheetPage]: 

368 """ 

369 Returns the page with the specific name, or ``None`` if no such 

370 page exists. 

371 """ 

372 return next( 

373 (page for page in self.pages if page.name == page_name), None 

374 ) 

375 

376 def add_page(self, page: SpreadsheetPage) -> None: 

377 """ 

378 Adds a new page to our collection. If the new page has the same name 

379 as an existing page, rows from the new page are added to the existing 

380 page. Does nothing if the new page is empty. 

381 """ 

382 if page.empty: 

383 return 

384 existing_page = self.page_with_name(page.name) 

385 if existing_page: 

386 # Blend with existing page 

387 existing_page.add_rows_from_page(page) 

388 else: 

389 # New page 

390 self.pages.append(page) 

391 

392 def add_pages(self, pages: List[SpreadsheetPage]) -> None: 

393 """ 

394 Adds all ``pages`` to our collection, via :func:`add_page`. 

395 """ 

396 for page in pages: 

397 self.add_page(page) 

398 

399 def sort_headings_within_all_pages(self) -> None: 

400 """ 

401 Sort headings within each of our pages. 

402 """ 

403 for page in self.pages: 

404 page.sort_headings() 

405 

406 def sort_pages(self) -> None: 

407 """ 

408 Sort our pages by their page name. 

409 """ 

410 self.pages.sort(key=lambda p: p.name) 

411 

412 def get_page_names(self) -> List[str]: 

413 """ 

414 Return a list of the names of all our pages. 

415 """ 

416 return [p.name for p in self.pages] 

417 

418 def delete_page(self, page_name: str) -> None: 

419 """ 

420 Delete any page with the name specified. 

421 """ 

422 self.pages = [p for p in self.pages if p.name != page_name] 

423 

424 def delete_pages(self, page_names: Container[str]) -> None: 

425 """ 

426 Delete pages with the names specified. 

427 """ 

428 self.pages = [p for p in self.pages if p.name not in page_names] 

429 

430 def delete_columns(self, headings: Container[str]) -> None: 

431 """ 

432 Across all pages, removes columns with the specified heading names. 

433 Used to simplify spreadsheets. 

434 """ 

435 for p in self.pages: 

436 p.delete_columns(headings) 

437 

438 # ------------------------------------------------------------------------- 

439 # TSV 

440 # ------------------------------------------------------------------------- 

441 

442 def get_tsv_file(self, page_name: str) -> str: 

443 """ 

444 Returns a TSV file for a named page. 

445 

446 Raises: 

447 :exc:`AssertionError` if the named page does not exist 

448 

449 """ 

450 page = self.page_with_name(page_name) 

451 assert page is not None, f"No such page with name {page_name}" 

452 return page.get_tsv() 

453 

454 # ------------------------------------------------------------------------- 

455 # ZIP of TSVs 

456 # ------------------------------------------------------------------------- 

457 

458 def write_zip( 

459 self, 

460 file: Union[str, BinaryIO], 

461 encoding: str = "utf-8", 

462 compression: int = zipfile.ZIP_DEFLATED, 

463 ) -> None: 

464 """ 

465 Writes data to a file, as a ZIP file of TSV files. 

466 

467 Args: 

468 file: filename or file-like object 

469 encoding: encoding to use when writing the TSV files 

470 compression: compression method to use 

471 

472 Choice of compression method: see 

473 

474 - https://docs.python.org/3/library/zipfile.html 

475 - https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT 

476 - https://en.wikipedia.org/wiki/Zip_(file_format)#Compression_methods 

477 

478 Note also that ``openpyxl`` uses ``ZIP_DEFLATED``, which seems to be 

479 the most portable if not the best compression. 

480 """ 

481 if isinstance(file, str): # it's a filename 

482 with open(file, "wb") as binaryfile: 

483 return self.write_zip(binaryfile, encoding) # recurse once 

484 with zipfile.ZipFile(file, mode="w", compression=compression) as z: 

485 # Write to ZIP. 

486 # If there are no valid task instances, there'll be no TSV; 

487 # that's OK. 

488 for filename_stem in self.get_page_names(): 

489 tsv_filename = filename_stem + ".tsv" 

490 tsv_contents = self.get_tsv_file(page_name=filename_stem) 

491 z.writestr(tsv_filename, tsv_contents.encode(encoding)) 

492 

493 def as_zip(self, encoding: str = "utf-8") -> bytes: 

494 """ 

495 Returns the TSV collection as a ZIP file containing TSV files. 

496 

497 Args: 

498 encoding: encoding to use when writing the TSV files 

499 """ 

500 with io.BytesIO() as memfile: 

501 self.write_zip(memfile, encoding) 

502 zip_contents = memfile.getvalue() 

503 return zip_contents 

504 

505 # ------------------------------------------------------------------------- 

506 # XLSX, ODS 

507 # ------------------------------------------------------------------------- 

508 

509 def write_xlsx(self, file: Union[str, BinaryIO]) -> None: 

510 """ 

511 Write the contents in XLSX (Excel) format to a file. 

512 

513 Args: 

514 file: filename or file-like object 

515 """ 

516 if XLSX_VIA_PYEXCEL: # use pyexcel_xlsx 

517 data = self._get_pyexcel_data(convert_for_openpyxl) 

518 pyexcel_xlsx.save_data(file, data) 

519 else: # use openpyxl 

520 # Marginal performance gain with write_only. Does not automatically 

521 # add a blank sheet 

522 wb = XLWorkbook(write_only=True) # type: ignore[misc] 

523 valid_name_dict = self.get_pages_with_valid_sheet_names() 

524 for page, title in valid_name_dict.items(): 

525 ws = wb.create_sheet(title=title) 

526 page.write_to_openpyxl_xlsx_worksheet(ws) 

527 wb.save(file) 

528 

529 def as_xlsx(self) -> bytes: 

530 """ 

531 Returns the TSV collection as an XLSX (Excel) file. 

532 """ 

533 with io.BytesIO() as memfile: 

534 self.write_xlsx(memfile) 

535 contents = memfile.getvalue() 

536 return contents 

537 

538 @staticmethod 

539 def get_sheet_title(page: SpreadsheetPage) -> str: 

540 r""" 

541 Returns a worksheet name for a :class:`SpreadsheetPage`. 

542 

543 See ``openpyxl/workbook/child.py``. 

544 

545 - Excel prohibits ``\``, ``*``, ``?``, ``:``, ``/``, ``[``, ``]`` 

546 - LibreOffice also prohibits ``'`` as first or last character but let's 

547 just replace that globally. 

548 """ 

549 title = re.sub(r"[\\*?:/\[\]']", "_", page.name) 

550 

551 if len(title) > 31: 

552 title = f"{title[:28]}..." 

553 

554 return title 

555 

556 def _get_pyexcel_data( 

557 self, converter: Callable[[Any], Any] 

558 ) -> Dict[str, List[List[Any]]]: 

559 """ 

560 Returns data in the format expected by ``pyexcel``, which is an ordered 

561 dictionary mapping sheet names to a list of rows, where each row is a 

562 list of cell values. 

563 """ 

564 data = OrderedDict() 

565 for page in self.pages: 

566 data[self.get_sheet_title(page)] = page.spreadsheetrows(converter) 

567 return data 

568 

569 def write_ods(self, file: Union[str, BinaryIO]) -> None: 

570 """ 

571 Writes an ODS (OpenOffice spreadsheet document) to a file. 

572 

573 Args: 

574 file: filename or file-like object 

575 """ 

576 if ODS_VIA_PYEXCEL: # use pyexcel_ods3 

577 data = self._get_pyexcel_data(convert_for_pyexcel_ods3) 

578 pyexcel_ods3.save_data(file, data) 

579 else: # use odswriter 

580 if isinstance(file, str): # it's a filename 

581 with open(file, "wb") as binaryfile: 

582 return self.write_ods(binaryfile) # recurse once 

583 # noinspection PyCallingNonCallable 

584 with ODSWriter(file) as odsfile: # type: ignore[misc] 

585 valid_name_dict = self.get_pages_with_valid_sheet_names() 

586 for page, title in valid_name_dict.items(): 

587 sheet = odsfile.new_sheet(name=title) 

588 page.write_to_odswriter_ods_worksheet(sheet) 

589 

590 def as_ods(self) -> bytes: 

591 """ 

592 Returns the TSV collection as an ODS (OpenOffice spreadsheet document) 

593 file. 

594 """ 

595 with io.BytesIO() as memfile: 

596 self.write_ods(memfile) 

597 contents = memfile.getvalue() 

598 return contents 

599 

600 def get_pages_with_valid_sheet_names(self) -> Dict[SpreadsheetPage, str]: 

601 """ 

602 Returns an ordered mapping from :class:`SpreadsheetPage` objects to 

603 their sheet names. 

604 """ 

605 name_dict = OrderedDict() 

606 

607 for page in self.pages: 

608 name_dict[page] = self.get_sheet_title(page) 

609 

610 self.make_sheet_names_unique(name_dict) 

611 

612 return name_dict 

613 

614 @staticmethod 

615 def make_sheet_names_unique(name_dict: Dict[SpreadsheetPage, str]) -> None: 

616 """ 

617 Modifies (in place) a mapping from :class:`SpreadsheetPage` to 

618 worksheet names, such that all page names are unique. 

619 

620 - See also :func:`avoid_duplicate_name` in 

621 ``openpxl/workbook/child.py`` 

622 - We keep the 31 character restriction 

623 """ 

624 unique_names = [] # type: List[str] 

625 

626 for page, name in name_dict.items(): 

627 attempt = 0 

628 

629 while name.lower() in unique_names: 

630 attempt += 1 

631 

632 if attempt > 1000: 

633 # algorithm failure, better to let Excel deal with the 

634 # consequences than get stuck in a loop 

635 log.debug( 

636 f"Failed to generate a unique sheet name from {name}" 

637 ) 

638 break 

639 

640 match = re.search(r"\d+$", name) 

641 count = 0 

642 if match is not None: 

643 count = int(match.group()) 

644 

645 new_suffix = str(count + 1) 

646 name = name[: -len(new_suffix)] + new_suffix 

647 name_dict[page] = name 

648 unique_names.append(name.lower()) 

649 

650 # ------------------------------------------------------------------------- 

651 # R 

652 # ------------------------------------------------------------------------- 

653 

654 def as_r(self) -> str: 

655 """ 

656 Returns data as an R script. 

657 

658 This could be more sophisticated, e.g. creating factors with 

659 appropriate levels (etc.). 

660 """ 

661 now = format_datetime( 

662 get_now_localtz_pendulum(), 

663 DateFormat.ISO8601_HUMANIZED_TO_SECONDS_TZ, 

664 ) 

665 table_definition_str = "\n\n".join( 

666 page.r_data_table_definition() for page in self.pages 

667 ) 

668 script = f"""#!/usr/bin/env Rscript 

669 

670# R script generated by CamCOPS at {now} 

671 

672# ============================================================================= 

673# Libraries 

674# ============================================================================= 

675 

676library(data.table) 

677 

678# ============================================================================= 

679# Data 

680# ============================================================================= 

681 

682{table_definition_str} 

683 

684""" 

685 return script 

686 

687 def write_r(self, filename: str, encoding: str = "utf-8") -> None: 

688 """ 

689 Write the contents in R format to a file. 

690 

691 Args: 

692 filename: filename or file-like object 

693 encoding: encoding to use 

694 """ 

695 with open(filename, "wt", encoding=encoding) as f: 

696 f.write(self.as_r()) 

697 

698 

699def _make_benchmarking_collection( 

700 nsheets: int = 100, 

701 nrows: int = 200, 

702 ncols: int = 30, 

703 mindata: int = 0, 

704 maxdata: int = 1000000, 

705) -> SpreadsheetCollection: 

706 log.info( 

707 f"Creating SpreadsheetCollection with nsheets={nsheets}, " 

708 f"nrows={nrows}, ncols={ncols}..." 

709 ) 

710 coll = SpreadsheetCollection() 

711 for sheetnum in range(1, nsheets + 1): 

712 rows = [ 

713 { 

714 f"c{colnum}": str(random.randint(mindata, maxdata)) 

715 for colnum in range(1, ncols + 1) 

716 } 

717 for _ in range(1, nrows + 1) 

718 ] 

719 page = SpreadsheetPage(name=f"sheet{sheetnum}", rows=rows) 

720 coll.add_page(page) 

721 log.info("... done.") 

722 return coll 

723 

724 

725def file_size(filename: str) -> int: 

726 """ 

727 Returns a file's size in bytes. 

728 """ 

729 return os.stat(filename).st_size 

730 

731 

732def benchmark_save( 

733 xlsx_filename: str = "test.xlsx", 

734 ods_filename: str = "test.ods", 

735 tsv_zip_filename: str = "test.zip", 

736 r_filename: str = "test.R", 

737) -> None: 

738 """ 

739 Use with: 

740 

741 .. code-block:: python 

742 

743 from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger 

744 from camcops_server.cc_modules.cc_spreadsheet import benchmark_save 

745 main_only_quicksetup_rootlogger() 

746 benchmark_save() 

747 

748 Args: 

749 xlsx_filename: XLSX file to create 

750 ods_filename: ODS file to create 

751 tsv_zip_filename: TSV ZIP file to create 

752 r_filename: R script to create 

753 

754 Problem in Nov 2019 is that ODS is extremely slow. Rough timings: 

755 

756 - TSV ZIP: about 4.1 Mb, about 0.2 s. Good. 

757 - XLSX (via openpyxl): about 4.6 Mb, 16 seconds. 

758 - XLSX (via pyexcel_xlsx): about 4.6 Mb, 16 seconds. 

759 - ODS (via odswriter): about 53 Mb, 56 seconds. 

760 - ODS (via pyexcel_ods3): about 2.8 Mb, 29 seconds. 

761 """ 

762 coll = _make_benchmarking_collection() 

763 

764 log.info("Writing TSV ZIP...") 

765 coll.write_zip(tsv_zip_filename) 

766 log.info(f"... done. File size {file_size(tsv_zip_filename)}") 

767 

768 log.info("Writing XLSX...") 

769 coll.write_xlsx(xlsx_filename) 

770 log.info(f"... done. File size {file_size(xlsx_filename)}") 

771 

772 log.info("Writing ODS...") 

773 coll.write_ods(ods_filename) 

774 log.info(f"... done. File size {file_size(ods_filename)}") 

775 

776 log.info("Writing R...") 

777 coll.write_r(r_filename) 

778 log.info(f"... done. File size {file_size(r_filename)}")