Coverage for anonymise/researcher_report.py: 66%

270 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2026-01-08 09:05 -0600

1#!/usr/bin/env python 

2 

3""" 

4crate_anon/anonymise/researcher_report.py 

5 

6=============================================================================== 

7 

8 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

10 

11 This file is part of CRATE. 

12 

13 CRATE is free software: you can redistribute it and/or modify 

14 it under the terms of the GNU General Public License as published by 

15 the Free Software Foundation, either version 3 of the License, or 

16 (at your option) any later version. 

17 

18 CRATE is distributed in the hope that it will be useful, 

19 but WITHOUT ANY WARRANTY; without even the implied warranty of 

20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

21 GNU General Public License for more details. 

22 

23 You should have received a copy of the GNU General Public License 

24 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

25 

26=============================================================================== 

27 

28**Produce a researcher-oriented report about a destination database.** 

29 

30""" 

31 

32import argparse 

33from dataclasses import dataclass 

34import datetime 

35import decimal 

36import enum 

37import logging 

38import os 

39from typing import Any, Dict, List, Optional, Tuple 

40 

41from cardinal_pythonlib.datetimefunc import ( 

42 format_datetime, 

43 get_now_localtz_pendulum, 

44 strfdelta, 

45) 

46from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger 

47from cardinal_pythonlib.pdf import make_pdf_on_disk_from_html 

48import django 

49from django.conf import settings 

50from django.template.loader import render_to_string 

51import pendulum 

52from sqlalchemy.engine.url import make_url, URL 

53from sqlalchemy.sql.expression import distinct, func, select, table 

54from sqlalchemy.schema import Column, Table 

55 

56from crate_anon.anonymise.config import Config 

57from crate_anon.anonymise.constants import ANON_CONFIG_ENV_VAR 

58from crate_anon.anonymise.dbholder import DatabaseHolder 

59from crate_anon.anonymise.ddr import DataDictionaryRow, DDRLabels 

60from crate_anon.common.argparse_assist import ( 

61 RawDescriptionArgumentDefaultsRichHelpFormatter, 

62) 

63from crate_anon.common.sql import ReflectedColumnInfo 

64from crate_anon.version import CRATE_VERSION, CRATE_VERSION_PRETTY 

65 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

67 

68 

69# ============================================================================= 

70# Constants 

71# ============================================================================= 

72 

73 

# Absolute path of the directory containing this module.
THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# Directory holding the report templates (see the Templates class).
TEMPLATE_DIR = os.path.join(THIS_DIR, "templates", "researcher_report")

76 

77 

class Templates:
    """
    Template filenames, within TEMPLATE_DIR.
    """

    PDF_FOOTER = "pdf_footer.html"  # rendered as the PDF footer
    PDF_HEADER = "pdf_header.html"  # rendered as the PDF header
    REPORT = "report.html"  # the main report body
    STYLE = "style.css"  # CSS; rendered with the base font size
    TABLE = "table.html"  # per-table section of the report

88 

89 

class DateFormat:
    """
    Format strings used when displaying dates, times, and time intervals.
    """

    # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes  # noqa: E501
    PRETTY = "%a %d %B %Y, %H:%M %z"
    # ... e.g. Wed 24 July 2013, 20:04 +0100
    DATE = "%Y-%m-%d"  # e.g. 2023-07-24
    DATETIME = "%Y-%m-%d %H:%M"  # e.g. 2023-07-24 20:04
    TIME = "%H:%M"  # e.g. 20:04

    # And one for our custom strfdelta function:
    TIMEDELTA = "{D:02}d {H:02}h {M:02}m {S:02}s"

100 

101 

class Default:
    """
    Default values.

    These serve both as the field defaults of ResearcherReportConfig and as
    the defaults of the corresponding command-line arguments.
    """

    BASE_FONT_SIZE = "11pt"
    HEADER_FOOTER_SPACING_MM = 3
    # ... always in mm; https://wkhtmltopdf.org/usage/wkhtmltopdf.txt
    MAX_DISTINCT_VALUES = 20  # how many distinct values to list per column
    MAX_VALUE_LENGTH = 50  # longest string literal shown before truncation
    ORIENTATION = "landscape"
    PAGE_SIZE = "A4"
    MARGIN_LEFT_RIGHT = "15mm"
    MARGIN_TOP_BOTTOM = "18mm"  # see HEADER_FOOTER_SPACING_MM

116 

117 

# Typographic characters used when formatting values and comments for the
# report (e.g. by literal() and mk_comment()).
ELLIPSIS = "…"
EN_DASH = "–"
MINUS = "−"
HYPHEN = "-"
SINGLE_QUOTE_L = "‘"
SINGLE_QUOTE_R = "’"
# SINGLE_QUOTE = "'"
# TWO_SINGLE_QUOTES = "''"
TICK = "✓"
# RIGHT_ARROW = "►"

128 

129 

130# ============================================================================= 

131# Helper classes/functions 

132# ============================================================================= 

133 

134 

@dataclass
class ResearcherReportConfig:
    """
    Settings governing a researcher report, plus the database connection and
    lookups derived from those settings.

    Besides the dataclass fields, ``__post_init__`` sets these attributes:

    - ``annotation_from_colname``: maps special CRATE column names (TRID,
      MRID, RID, source hash) to their annotation labels; empty if no
      anonymisation config was supplied.
    - ``db``: the database holder for the database being reported on.
    - ``db_session``: that holder's session.

    Raises:
        ValueError:
            (on construction) if a database URL is given without a database
            name, or if neither a database URL nor an anonymisation config is
            given.
    """

    output_filename: str
    anonconfig: Optional[Config] = None

    base_font_size: str = Default.BASE_FONT_SIZE
    db_name: Optional[str] = None  # overrides that in config
    db_url: Optional[str] = None  # overrides that in config
    # ... if not supplied, db_url is replaced by the engine's URL object
    debug_pdf: bool = False
    max_distinct_values: int = Default.MAX_DISTINCT_VALUES
    max_value_length: int = Default.MAX_VALUE_LENGTH
    header_footer_spacing_mm: int = Default.HEADER_FOOTER_SPACING_MM
    margin_left_right: str = Default.MARGIN_LEFT_RIGHT
    margin_top_bottom: str = Default.MARGIN_TOP_BOTTOM
    page_size: str = Default.PAGE_SIZE
    orientation: str = Default.ORIENTATION
    show_counts: bool = True  # count records in each table?
    show_url: bool = True  # include a sanitised URL for the database
    show_values: bool = True  # include specimen values/ranges
    skip_values_if_too_many: bool = False
    use_dd: bool = True  # include info from the data dictionary
    echo: bool = False  # echo SQL

    def __post_init__(self) -> None:
        """
        Sets up lookups, the data dictionary (if used), and the database
        connection.
        """
        anonconfig = self.anonconfig

        # Set up lookups. Always define the attribute, so that
        # get_annotation_when_no_ddr_found() also works when there is no
        # anonymisation config.
        self.annotation_from_colname = {}  # type: Dict[str, Any]
        if anonconfig:
            self.annotation_from_colname = {
                anonconfig.trid_fieldname: DDRLabels.TRID,
                anonconfig.master_research_id_fieldname: DDRLabels.MRID,
                anonconfig.research_id_fieldname: DDRLabels.RID,
                anonconfig.source_hash_fieldname: DDRLabels.SOURCE_HASH,
            }

            # Set up DD
            if self.use_dd:
                anonconfig.load_dd(check_against_source_db=False)
        else:
            # No config means no data dictionary.
            self.use_dd = False

        # Set up database
        if self.db_url:
            # Use a custom database
            if not self.db_name:
                raise ValueError(
                    "Must specify database name if passing a custom URL"
                )
            self.db = DatabaseHolder(
                self.db_name,
                self.db_url,
                with_session=True,
                reflect=True,
                echo=self.echo,
            )
        else:
            # Use destination database from the config
            if not anonconfig:
                raise ValueError(
                    "Must specify a CRATE anonymisation config file if you "
                    "do not specify a database by URL/name"
                )
            self.db = anonconfig.destdb
            self.db.engine.echo = self.echo
            self.db.enable_reflect()
            self.db.create_session()
            self.db_name = self.db_name or anonconfig.destdb.name
            self.db_url = self.db.engine.url

        self.db_session = self.db.session

    def safe_db_url_if_selected(self) -> str:
        """
        Sanitised version of the database URL, or a blank string if not
        enabled.
        """
        if not self.show_url or not self.db_url:
            return ""
        url_obj = make_url(self.db_url)  # type: URL
        return repr(url_obj)
        # For SQLAlchemy URL objects, the default str() implementation calls
        # self.__to_string__(hide_password=False), but the default repr() hides
        # passwords.

    def wkhtmltopdf_options(self) -> Dict[str, Optional[str]]:
        """
        Returns wkhtmltopdf options for the current setup.
        """
        return {  # dict for pdfkit
            "page-size": self.page_size,
            "margin-left": self.margin_left_right,
            "margin-right": self.margin_left_right,
            "margin-top": self.margin_top_bottom,
            "margin-bottom": self.margin_top_bottom,
            "header-spacing": str(self.header_footer_spacing_mm),
            "footer-spacing": str(self.header_footer_spacing_mm),
            # "--print-media-type": None
            # ... https://stackoverflow.com/q/42005819
            "orientation": self.orientation,
        }

    def get_db_name(self) -> str:
        """
        Returns a short database name used for titles.
        """
        return self.db_name

    def get_db_engine_type(self) -> str:
        """
        Returns the engine type (e.g. mysql).
        """
        return self.db.engine.name

    def get_annotation_when_no_ddr_found(self, col_name: str) -> str:
        """
        Returns best-guess CRATE annotation information when no data dictionary
        row is available.

        Args:
            col_name:
                Column name.

        Returns:
            The label for a recognised special CRATE column name, or
            ``DDRLabels.UNKNOWN`` otherwise (including when there is no
            anonymisation config to recognise names from).
        """
        return self.annotation_from_colname.get(col_name, DDRLabels.UNKNOWN)

257 

258 

def template(filename: str) -> str:
    """
    Builds the path to a template file inside our template directory.

    Args:
        filename:
            Template filename, relative to TEMPLATE_DIR.

    Returns:
        TEMPLATE_DIR joined with the filename.
    """
    path_parts = (TEMPLATE_DIR, filename)
    return os.path.join(*path_parts)

264 

265 

def mk_comment(
    reportcfg: ResearcherReportConfig,
    column: Column,
    ddr: DataDictionaryRow = None,
) -> str:
    """
    Chooses the comment to display for a column.

    The database's own column comment and the CRATE data dictionary (DD)
    comment may both exist. We show whichever is present; if one contains
    the other, we show the longer one only (to avoid duplication); if they
    differ, we show both, labelled. The DD comment is ignored if DD use is
    disabled.

    Args:
        reportcfg:
            ResearcherReportConfig object, governing the report.
        column:
            SQLAlchemy Column whose comment is wanted.
        ddr:
            Corresponding CRATE DataDictionaryRow, if there is one.

    Returns:
        The comment text, or an en dash if there is nothing to show.
    """
    db_text = column.comment or ""

    if reportcfg.use_dd:
        dd_text = ""
        if ddr:
            dd_text = ddr.comment or ""

        if not (db_text or dd_text):
            # Nothing from either source.
            return EN_DASH
        if dd_text in db_text:  # within, or equals (or DD comment blank)
            return db_text
        if db_text in dd_text:
            return dd_text
        # Two genuinely different comments: show both.
        return f"[DB] {db_text} [DD] {dd_text}"

    # Not using the DD: database comment only.
    return db_text or EN_DASH

287 

288 

def literal(
    value: Any,
    max_length: int = Default.MAX_VALUE_LENGTH,
    truncated_suffix: str = ELLIPSIS,
) -> str:
    """
    Formats a value as an approximate SQL literal, for people to read (not
    for machines to parse). Long strings are cut off at ``max_length``
    characters.

    - Some duplication from within
      cardinal_pythonlib.sqlalchemy.dump.get_literal_query.
    - Dates/times are NOT enclosed in quotes here.

    Args:
        value:
            The value to represent.
        max_length:
            Maximum number of characters of a string value to show.
        truncated_suffix:
            Marker appended to a string that has been cut short.

    Returns:
        The textual representation.

    Raises:
        NotImplementedError:
            if the value's type is not one we know how to represent.
    """
    if value is None:
        return "NULL"

    if isinstance(value, str):
        # We won't escape quotes. This report is about visual ease, not
        # electronic exactness.
        n_chars = len(value)
        if n_chars <= max_length:
            return SINGLE_QUOTE_L + value + SINGLE_QUOTE_R
        return (
            SINGLE_QUOTE_L
            + value[:max_length]
            + truncated_suffix
            + SINGLE_QUOTE_R
            + f" [length {n_chars}]"
        )

    if isinstance(value, (float, int)):
        return repr(value).replace(HYPHEN, MINUS)

    if isinstance(value, decimal.Decimal):
        return str(value).replace(HYPHEN, MINUS)

    # Check datetime before date: a datetime is also a date.
    if isinstance(value, (datetime.datetime, pendulum.DateTime)):
        return value.strftime(DateFormat.DATETIME)

    if isinstance(value, (datetime.date, pendulum.Date)):
        return value.strftime(DateFormat.DATE)

    if isinstance(value, (datetime.time, pendulum.Time)):
        return value.strftime(DateFormat.TIME)

    if isinstance(value, bytes):
        return f"<binary_length_{len(value)}>"

    if isinstance(value, datetime.timedelta):
        return strfdelta(value, fmt=DateFormat.TIMEDELTA)

    if isinstance(value, enum.Enum):
        return f"{value.name} ({value.value})"

    raise NotImplementedError(f"Don't know how to represent value {value!r}")

336 

337 

def sorter(x: Any) -> Tuple[bool, Any]:
    """
    Sort key for values that may be None/NULL. The first element of the key
    is False for None and True otherwise; since False < True, None values
    come first in a default (ascending) sort, and None is never compared
    directly with a non-None value.
    """
    has_value = x is not None
    return has_value, x

344 

345 

346# ============================================================================= 

347# Researcher report about destination database 

348# ============================================================================= 

349 

350 

def get_values_summary(
    column: Column,
    reportcfg: ResearcherReportConfig,
    ddr: DataDictionaryRow = None,
) -> str:
    """
    Return a textual summary of values in a column (from a de-identified
    database).

    Unless value display is disabled, the summary always gives the number of
    distinct non-NULL values. Min/max and the distinct values themselves are
    added only if the column is not empty, is not flagged by its data
    dictionary row as relating to patient/third-party/scrubbing information,
    and is not one of the special columns named in
    ``reportcfg.annotation_from_colname``.

    Args:
        column:
            SQLAlchemy Column object to summarize. (It knows its own Table.)
        reportcfg:
            ResearcherReportConfig object, governing the report.
        ddr:
            Corresponding CRATE DataDictionaryRow, if there is one.

    Returns:
        Summary text, or an en dash if values are not being shown.
    """
    if not reportcfg.show_values:
        # Don't show anything.
        return EN_DASH

    # Otherwise, we can always do the number of distinct values:
    items = []  # type: List[str]
    session = reportcfg.db_session
    n_distinct_notnull = session.execute(
        select(func.count(distinct(column)))
    ).fetchone()[0]
    # This does NOT include NULL values, by the SQL standard.
    suffix = "" if n_distinct_notnull == 1 else "s"  # "value" or "values"?
    items.append(f"{n_distinct_notnull} distinct non-null value{suffix}.")

    show_min_max = False
    show_distinct = False  # show the actual distinct values?

    empty = n_distinct_notnull == 0
    sensitive = (
        not empty
        and ddr
        and (
            ddr.contains_patient_info
            or ddr.contains_third_party_info
            or ddr.contains_scrub_src
            or ddr.being_scrubbed
        )
    )
    # ... not *actually* sensitive; merely having the appearance of being
    # sensitive for a general-purpose report.
    dull = (
        not empty
        and not sensitive
        and reportcfg.use_dd
        and not ddr
        and column.name in reportcfg.annotation_from_colname
    )

    if not (empty or sensitive or dull):
        # Show some more detail.
        if n_distinct_notnull > 1:
            show_min_max = True
        if (
            n_distinct_notnull <= reportcfg.max_distinct_values
            or not reportcfg.skip_values_if_too_many
        ):
            show_distinct = True

    def lit(value: Any) -> str:
        return literal(value, reportcfg.max_value_length)

    if show_min_max:
        min_val, max_val = session.execute(
            select(func.min(column), func.max(column))
        ).fetchone()
        items.append(f"Min {lit(min_val)}; max {lit(max_val)}.")

    if show_distinct:
        # Fetch one more row than we will display. The limit is governed by
        # the maximum number of distinct values (NOT the maximum length of a
        # displayed value, which is unrelated).
        dv_rows = session.execute(
            select(column)
            .distinct()
            .order_by(column)
            .limit(reportcfg.max_distinct_values + 1)
        ).fetchall()
        # These WILL include any NULL values, so there may be one more than
        # n_distinct_notnull (or the same, if there are no NULLs). The only
        # way to be sure if we are truncating, therefore, and to show a
        # truncation indicator, is to fetch up to one more and see if we are
        # over the limit.
        # Sort before literal (so we get numeric, not string, sort):
        distinct_values = sorted((row[0] for row in dv_rows), key=sorter)
        distinct_value_elements = [lit(v) for v in distinct_values]
        if len(distinct_values) > reportcfg.max_distinct_values:
            distinct_value_elements = distinct_value_elements[
                0 : reportcfg.max_distinct_values
            ] + [ELLIPSIS]
        distinct_value_str = ", ".join(distinct_value_elements)
        items.append(f"Distinct values: {{{distinct_value_str}}}.")
        # It's a set, so use set notation.

    return " ".join(items)

449 

450 

def mk_table_html(table_name: str, reportcfg: ResearcherReportConfig) -> str:
    """
    Builds the HTML section of the report that describes a single table:
    its comment, optionally its row count, and details of every column
    (in alphabetical order of column name).

    Args:
        table_name:
            Table to process.
        reportcfg:
            ResearcherReportConfig object, governing the report.

    Returns:
        HTML as a string.
    """
    log.info(f"Processing table: {table_name}")
    using_dd = reportcfg.use_dd

    # Data dictionary rows whose destination is this table (if using the DD).
    if using_dd:
        dd_rows = reportcfg.anonconfig.dd.get_rows_for_dest_table(table_name)
    else:
        dd_rows = []
    session = reportcfg.db_session

    # Rows versus records: https://dba.stackexchange.com/questions/31805/
    n_rows = None
    if reportcfg.show_counts:
        count_query = select(func.count()).select_from(table(table_name))
        n_rows = session.execute(count_query).fetchone()[0]

    reflected = reportcfg.db.metadata.tables[table_name]  # type: Table
    table_comment = reflected.comment or ""  # may be blank
    column_infos = []  # type: List[ReflectedColumnInfo]
    for col in sorted(reflected.c, key=lambda x: x.name):  # type: Column
        log.debug(repr(col))
        col_name = col.name

        # Find the DD row (first match) and annotation for this column.
        ddr = None
        annotation = ""
        if using_dd:
            try:
                ddr = next(r for r in dd_rows if r.dest_field == col_name)
                annotation = ddr.report_dest_annotation()
            except StopIteration:
                # No DD row for this column: make a best guess.
                ddr = None
                annotation = reportcfg.get_annotation_when_no_ddr_found(
                    col_name=col_name
                )

        values_info = get_values_summary(
            column=col,
            reportcfg=reportcfg,
            ddr=ddr,
        )
        info = ReflectedColumnInfo(
            column=col,
            override_comment=mk_comment(reportcfg, col, ddr),
            crate_annotation=annotation,
            values_info=values_info,
        )
        column_infos.append(info)

    context = {
        "columns": column_infos,
        "n_rows": n_rows,
        "show_counts": reportcfg.show_counts,
        "show_values": reportcfg.show_values,
        "table_comment": table_comment,
        "table_name": table_name,
        "use_dd": reportcfg.use_dd,
    }
    return render_to_string(template(Templates.TABLE), context)

525 

526 

def mk_researcher_report_html(
    reportcfg: ResearcherReportConfig,
) -> Tuple[str, str, str]:
    """
    Builds the HTML components of a researcher-oriented report about a
    destination database: a page header, the main report, and a page footer.

    Args:
        reportcfg:
            ResearcherReportConfig object, governing the report.

    Returns:
        tuple: header_html, html, footer_html
    """
    # -------------------------------------------------------------------------
    # 1. Set up Django for templates.
    # -------------------------------------------------------------------------
    # https://stackoverflow.com/questions/28123603
    # Settings will already be configured when testing with pytest, so only
    # configure them if nobody has done so yet.
    if not settings.configured:
        template_engine = {
            "BACKEND": "django.template.backends.django.DjangoTemplates",
            "DIRS": [TEMPLATE_DIR],
        }
        settings.configure(TEMPLATES=[template_engine])
        django.setup()

    # -------------------------------------------------------------------------
    # 2. Core variables
    # -------------------------------------------------------------------------
    db_name = reportcfg.get_db_name()
    now = format_datetime(get_now_localtz_pendulum(), DateFormat.PRETTY)
    title = f"{db_name}: CRATE researcher report, {now}"
    css = render_to_string(
        template(Templates.STYLE),
        {"base_font_size": reportcfg.base_font_size},
    )
    coredict = {"title": title, "css": css, "now": now}

    # -------------------------------------------------------------------------
    # 3. Read header/footer (e.g. for PDF page numbers).
    # -------------------------------------------------------------------------
    header_html = render_to_string(template(Templates.PDF_HEADER), coredict)
    footer_html = render_to_string(template(Templates.PDF_FOOTER), coredict)

    # -------------------------------------------------------------------------
    # 4. Scan the database.
    # -------------------------------------------------------------------------
    table_names = sorted(reportcfg.db.table_names)  # reflects (introspects)

    # -------------------------------------------------------------------------
    # 5. Generate our main report.
    # -------------------------------------------------------------------------
    tables_html = "".join(
        mk_table_html(table_name, reportcfg) for table_name in table_names
    )
    report_context = {
        "CRATE_VERSION": CRATE_VERSION,
        "db_engine": reportcfg.get_db_engine_type(),
        "db_name": db_name,
        "n_tables": len(table_names),
        "table_names": table_names,
        "tables_html": tables_html,
        "url": reportcfg.safe_db_url_if_selected(),
        **coredict,
    }
    html = render_to_string(template(Templates.REPORT), report_context)

    # -------------------------------------------------------------------------
    # 6. Return HTML components.
    # -------------------------------------------------------------------------
    return header_html, html, footer_html

606 

607 

def mk_researcher_report_pdf(
    reportcfg: ResearcherReportConfig,
) -> bool:
    """
    Generates the researcher-oriented report about a destination database
    and writes it to disk as a PDF, at ``reportcfg.output_filename``.

    Args:
        reportcfg:
            ResearcherReportConfig object, governing the report.

    Returns:
        success
    """
    header_html, html, footer_html = mk_researcher_report_html(reportcfg)
    destination = reportcfg.output_filename
    log.info(f"Writing to {destination}")

    # A single flag switches on all of the PDF debugging options.
    debug = reportcfg.debug_pdf
    pdf_kwargs = dict(
        html=html,
        output_path=destination,
        header_html=header_html,
        footer_html=footer_html,
        wkhtmltopdf_options=reportcfg.wkhtmltopdf_options(),
        debug_options=debug,
        debug_content=debug,
        debug_wkhtmltopdf_args=debug,
    )
    return make_pdf_on_disk_from_html(**pdf_kwargs)

634 

635 

636# ============================================================================= 

637# Main 

638# ============================================================================= 

639 

640 

def main() -> None:
    """
    Command-line entry point.

    Parses the command-line arguments, sets up logging, builds a
    ResearcherReportConfig, and writes the PDF report.

    Raises:
        SystemExit:
            with exit code 1 if PDF creation reports failure (in addition to
            argparse's own exits for bad arguments or ``--help``).
    """
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description=f"""
Produce a researcher-oriented PDF report about a destination database.
({CRATE_VERSION_PRETTY})

Note: if wkhtmltopdf reports 'Too many open files', see
- https://stackoverflow.com/q/25355697;
- https://github.com/wkhtmltopdf/wkhtmltopdf/issues/3081;
setting e.g. "ulimit -n 2048" is one solution.

""",
        formatter_class=RawDescriptionArgumentDefaultsRichHelpFormatter,
    )

    parser.add_argument("output", help="PDF output filename")

    grp_db = parser.add_argument_group("DATABASE")
    grp_db.add_argument(
        "--config",
        help=f"CRATE anonymisation config file, overriding environment "
        f"variable {ANON_CONFIG_ENV_VAR}",
    )
    grp_db.add_argument(
        "--noconfig",
        action="store_true",
        help="Do not use a config file (unusual)",
    )
    grp_db.add_argument(
        "--db_url",
        type=str,
        default=None,
        help="Database URL, overriding that in the config file",
    )
    grp_db.add_argument(
        "--db_name",
        type=str,
        default=None,
        help="Database name, overriding that in the config file; must be "
        "specified if you use --db_url",
    )

    # The options below come in positive/negative pairs sharing one "dest".
    # Each option's "default" is there so that the help text shows the state
    # in which that option is already in effect.
    # NOTE(review): with a shared dest, argparse appears to take the initial
    # value from whichever option of the pair is added first -- confirm.
    grp_detail = parser.add_argument_group("DETAIL")
    grp_detail.add_argument(
        "--show_url",
        dest="show_url",
        action="store_true",
        default=False,
        help="Include sanitised, password-safe version of database URL",
    )
    grp_detail.add_argument(
        "--no_show_url",
        dest="show_url",
        action="store_false",
        default=True,
        help="Do not include database URL",
    )
    grp_detail.add_argument(
        "--show_counts",
        dest="show_counts",
        action="store_true",
        default=True,
        help="Include row counts for each table",
    )
    grp_detail.add_argument(
        "--no_show_counts",
        dest="show_counts",
        action="store_false",
        default=False,
        help="Do not include row counts",
    )
    # NOTE(review): the final clause of the --use_dd help text ("only
    # sensible for reporting on a database completely unrelated to the DD")
    # reads as if it was meant for --no_use_dd; left unchanged pending
    # confirmation.
    grp_detail.add_argument(
        "--use_dd",
        dest="use_dd",
        action="store_true",
        default=True,
        help="Use information obtainable from the CRATE data dictionary (DD), "
        "including comments, annotations, and value suppression for "
        "potentially sensitive fields; only sensible for reporting on a "
        "database completely unrelated to the DD",
    )
    grp_detail.add_argument(
        "--no_use_dd",
        dest="use_dd",
        action="store_false",
        default=False,
        help="Do not use information from the CRATE data dictionary",
    )
    grp_detail.add_argument(
        "--show_values",
        dest="show_values",
        action="store_true",
        default=True,
        help="Include specimen values/ranges",
    )
    grp_detail.add_argument(
        "--no_show_values",
        dest="show_values",
        action="store_false",
        default=False,
        help="Do not include specimen values/ranges",
    )
    grp_detail.add_argument(
        "--max_distinct_values",
        type=int,
        default=Default.MAX_DISTINCT_VALUES,
        help="Maximum number of distinct values to show, if applicable",
    )
    grp_detail.add_argument(
        "--skip_values_if_too_many",
        action="store_true",
        help="If showing values, and there are more distinct values than the "
        "maximum, omit them (rather than showing the first few)?",
    )
    grp_detail.add_argument(
        "--max_value_length",
        type=int,
        default=Default.MAX_VALUE_LENGTH,
        help="Maximum string length to show for a literal value",
    )

    grp_visuals = parser.add_argument_group("VISUALS")
    grp_visuals.add_argument(
        "--page_size",
        default=Default.PAGE_SIZE,
        help="Page size, i.e. paper type",
    )
    grp_visuals.add_argument(
        "--margin_left_right",
        default=Default.MARGIN_LEFT_RIGHT,
        help="Page left/right margins, with units",
    )
    grp_visuals.add_argument(
        "--margin_top_bottom",
        default=Default.MARGIN_TOP_BOTTOM,
        help="Page top/bottom margins for content, ignoring header/footer "
        "(see --header_footer_spacing_mm), with units",
    )
    grp_visuals.add_argument(
        "--header_footer_spacing_mm",
        type=int,
        default=Default.HEADER_FOOTER_SPACING_MM,
        help="Gap between content and header/footer, in mm",
    )
    grp_visuals.add_argument(
        "--orientation",
        choices=["portrait", "landscape"],
        default=Default.ORIENTATION,
        help="Page orientation",
    )
    grp_visuals.add_argument(
        "--base_font_size",
        default=Default.BASE_FONT_SIZE,
        help="Base font size, with units",
    )

    grp_progress = parser.add_argument_group("PROGRESS")
    grp_progress.add_argument(
        "--verbose", "-v", action="store_true", help="Be verbose"
    )
    grp_progress.add_argument(
        "--debug_pdf", action="store_true", help="Debug PDF creation"
    )

    args = parser.parse_args()

    # -------------------------------------------------------------------------
    # Verbosity, logging
    # -------------------------------------------------------------------------

    loglevel = logging.DEBUG if args.verbose else logging.INFO
    main_only_quicksetup_rootlogger(level=loglevel)

    # -------------------------------------------------------------------------
    # Onwards
    # -------------------------------------------------------------------------

    if args.config:
        # Must be set before the config singleton is imported (below).
        os.environ[ANON_CONFIG_ENV_VAR] = args.config
    if args.noconfig:
        log.info("Not using a CRATE anonymisation config file")
        config = None
    else:
        # Delayed import, so it happens only if a config is wanted, and only
        # after the environment variable may have been set above.
        from crate_anon.anonymise.config_singleton import (
            config,
        )

    reportcfg = ResearcherReportConfig(
        anonconfig=config,
        base_font_size=args.base_font_size,
        db_name=args.db_name,
        db_url=args.db_url,
        debug_pdf=args.debug_pdf,
        header_footer_spacing_mm=args.header_footer_spacing_mm,
        margin_left_right=args.margin_left_right,
        margin_top_bottom=args.margin_top_bottom,
        max_distinct_values=args.max_distinct_values,
        max_value_length=args.max_value_length,
        orientation=args.orientation,
        output_filename=args.output,
        page_size=args.page_size,
        show_counts=args.show_counts,
        show_url=args.show_url,
        show_values=args.show_values,
        skip_values_if_too_many=args.skip_values_if_too_many,
        use_dd=args.use_dd,
    )

    # Don't ignore the success flag: a failed PDF should not look like a
    # successful run to the calling shell/script.
    success = mk_researcher_report_pdf(reportcfg)
    if not success:
        log.error(f"Failed to create PDF: {reportcfg.output_filename}")
        raise SystemExit(1)

854 

855 

# Run the command-line entry point when this file is executed as a script.
if __name__ == "__main__":
    main()