Coverage for anonymise/researcher_report.py: 66%
270 statements
« prev ^ index » next coverage.py v7.8.0, created at 2026-01-08 09:05 -0600
« prev ^ index » next coverage.py v7.8.0, created at 2026-01-08 09:05 -0600
1#!/usr/bin/env python
3"""
4crate_anon/anonymise/researcher_report.py
6===============================================================================
8 Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
9 Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
11 This file is part of CRATE.
13 CRATE is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 CRATE is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with CRATE. If not, see <https://www.gnu.org/licenses/>.
26===============================================================================
28**Produce a researcher-oriented report about a destination database.**
30"""
32import argparse
33from dataclasses import dataclass
34import datetime
35import decimal
36import enum
37import logging
38import os
39from typing import Any, Dict, List, Optional, Tuple
41from cardinal_pythonlib.datetimefunc import (
42 format_datetime,
43 get_now_localtz_pendulum,
44 strfdelta,
45)
46from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
47from cardinal_pythonlib.pdf import make_pdf_on_disk_from_html
48import django
49from django.conf import settings
50from django.template.loader import render_to_string
51import pendulum
52from sqlalchemy.engine.url import make_url, URL
53from sqlalchemy.sql.expression import distinct, func, select, table
54from sqlalchemy.schema import Column, Table
56from crate_anon.anonymise.config import Config
57from crate_anon.anonymise.constants import ANON_CONFIG_ENV_VAR
58from crate_anon.anonymise.dbholder import DatabaseHolder
59from crate_anon.anonymise.ddr import DataDictionaryRow, DDRLabels
60from crate_anon.common.argparse_assist import (
61 RawDescriptionArgumentDefaultsRichHelpFormatter,
62)
63from crate_anon.common.sql import ReflectedColumnInfo
64from crate_anon.version import CRATE_VERSION, CRATE_VERSION_PRETTY
log = logging.getLogger(__name__)


# =============================================================================
# Constants
# =============================================================================

# Absolute path of the directory containing this module. Templates are found
# relative to this, not relative to the current working directory.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))

# Directory holding the templates used to render the researcher report.
TEMPLATE_DIR = os.path.join(THIS_DIR, "templates", "researcher_report")
class Templates:
    """
    Names of the template files used to build the report. Each is a filename
    relative to TEMPLATE_DIR.
    """

    # Whole-report document, and the fragment rendered once per table.
    REPORT = "report.html"
    TABLE = "table.html"

    # Stylesheet (rendered as a template, so it can take a font size).
    STYLE = "style.css"

    # Page header/footer for the PDF.
    PDF_HEADER = "pdf_header.html"
    PDF_FOOTER = "pdf_footer.html"
class DateFormat:
    """
    Format strings used when displaying dates, times, and durations.
    """

    # The first four are strftime() formats; see
    # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes # noqa: E501
    DATE = "%Y-%m-%d"  # e.g. 2023-07-24
    TIME = "%H:%M"  # e.g. 20:04
    DATETIME = "%Y-%m-%d %H:%M"  # e.g. 2023-07-24 20:04
    PRETTY = "%a %d %B %Y, %H:%M %z"
    # ... e.g. Wed 24 July 2013, 20:04 +0100

    # This one is not for strftime(); it is a str.format()-style pattern for
    # our custom strfdelta function:
    TIMEDELTA = "{D:02}d {H:02}h {M:02}m {S:02}s"
class Default:
    """
    Default settings for the report, shared by the report configuration
    class and the command-line parser.
    """

    # How much data to show
    MAX_DISTINCT_VALUES = 20
    MAX_VALUE_LENGTH = 50

    # Page geometry
    PAGE_SIZE = "A4"
    ORIENTATION = "landscape"
    MARGIN_LEFT_RIGHT = "15mm"
    MARGIN_TOP_BOTTOM = "18mm"  # see HEADER_FOOTER_SPACING_MM
    HEADER_FOOTER_SPACING_MM = 3
    # ... always in mm; https://wkhtmltopdf.org/usage/wkhtmltopdf.txt

    # Typography
    BASE_FONT_SIZE = "11pt"
# Typographic characters used when formatting values for human readers.

# Dashes and signs
HYPHEN = "-"  # U+002D, the ASCII hyphen-minus
MINUS = "−"  # U+2212, a true minus sign
EN_DASH = "–"  # U+2013

# Quotation marks
SINGLE_QUOTE_L = "‘"  # U+2018
SINGLE_QUOTE_R = "’"  # U+2019
# SINGLE_QUOTE = "'"
# TWO_SINGLE_QUOTES = "''"

# Other symbols
ELLIPSIS = "…"  # U+2026
TICK = "✓"  # U+2713
# RIGHT_ARROW = "►"
130# =============================================================================
131# Helper classes/functions
132# =============================================================================
@dataclass
class ResearcherReportConfig:
    """
    Settings governing a researcher report, plus the database connection
    that the report is generated from.

    The dataclass fields are the user-selectable options. After
    construction, ``__post_init__`` additionally provides:

    - ``annotation_from_colname``: maps the config's special column names
      (TRID, MRID, RID, source hash) to their ``DDRLabels`` annotations;
      empty if there is no anonymisation config.
    - ``db``: the database holder being reported on.
    - ``db_session``: that holder's session.
    """

    output_filename: str  # PDF file to write
    anonconfig: Optional[Config] = None  # CRATE anonymisation config, if any

    base_font_size: str = Default.BASE_FONT_SIZE
    db_name: Optional[str] = None  # overrides that in config
    db_url: Optional[str] = None  # overrides that in config
    debug_pdf: bool = False
    max_distinct_values: int = Default.MAX_DISTINCT_VALUES
    max_value_length: int = Default.MAX_VALUE_LENGTH
    header_footer_spacing_mm: int = Default.HEADER_FOOTER_SPACING_MM
    margin_left_right: str = Default.MARGIN_LEFT_RIGHT
    margin_top_bottom: str = Default.MARGIN_TOP_BOTTOM
    page_size: str = Default.PAGE_SIZE
    orientation: str = Default.ORIENTATION
    show_counts: bool = True  # count records in each table?
    show_url: bool = True  # include a sanitised URL for the database
    show_values: bool = True  # include specimen values/ranges
    skip_values_if_too_many: bool = False
    use_dd: bool = True  # include info from the data dictionary
    echo: bool = False  # echo SQL

    def __post_init__(self) -> None:
        """
        Set up the annotation lookup, the data dictionary (if wanted), and
        the database connection.

        Raises:
            ValueError:
                if a database URL is given without a database name, or if
                neither a database URL nor an anonymisation config is given.
        """
        anonconfig = self.anonconfig

        # Set up lookups (and the data dictionary). The lookup is created
        # even when there is no config, so get_annotation_when_no_ddr_found()
        # is always safe to call, rather than raising AttributeError.
        if anonconfig:
            self.annotation_from_colname = {
                anonconfig.trid_fieldname: DDRLabels.TRID,
                anonconfig.master_research_id_fieldname: DDRLabels.MRID,
                anonconfig.research_id_fieldname: DDRLabels.RID,
                anonconfig.source_hash_fieldname: DDRLabels.SOURCE_HASH,
            }

            # Set up DD
            if self.use_dd:
                anonconfig.load_dd(check_against_source_db=False)
        else:
            self.annotation_from_colname = {}
            # No config means no data dictionary, whatever was requested.
            self.use_dd = False

        # Set up database
        if self.db_url:
            # Use a custom database
            if not self.db_name:
                raise ValueError(
                    "Must specify database name if passing a custom URL"
                )
            self.db = DatabaseHolder(
                self.db_name,
                self.db_url,
                with_session=True,
                reflect=True,
                echo=self.echo,
            )
        else:
            # Use destination database from the config
            if not anonconfig:
                raise ValueError(
                    "Must specify a CRATE anonymisation config file if you "
                    "do not specify a database by URL/name"
                )
            self.db = anonconfig.destdb
            self.db.engine.echo = self.echo
            self.db.enable_reflect()
            self.db.create_session()
            self.db_name = self.db_name or anonconfig.destdb.name
            # NB this is the engine's URL object, not necessarily a string;
            # make_url() in safe_db_url_if_selected() is applied to it.
            self.db_url = self.db.engine.url

        self.db_session = self.db.session

    def safe_db_url_if_selected(self) -> str:
        """
        Sanitised version of the database URL, or a blank string if not
        enabled.
        """
        if not self.show_url or not self.db_url:
            return ""
        url_obj = make_url(self.db_url)  # type: URL
        # For SQLAlchemy URL objects, the default str() implementation calls
        # self.__to_string__(hide_password=False), but the default repr()
        # hides passwords. So we use repr().
        return repr(url_obj)

    def wkhtmltopdf_options(self) -> Dict[str, Optional[str]]:
        """
        Returns wkhtmltopdf options for the current setup.
        """
        return {  # dict for pdfkit
            "page-size": self.page_size,
            "margin-left": self.margin_left_right,
            "margin-right": self.margin_left_right,
            "margin-top": self.margin_top_bottom,
            "margin-bottom": self.margin_top_bottom,
            "header-spacing": str(self.header_footer_spacing_mm),
            "footer-spacing": str(self.header_footer_spacing_mm),
            # "--print-media-type": None
            # ... https://stackoverflow.com/q/42005819
            "orientation": self.orientation,
        }

    def get_db_name(self) -> str:
        """
        Returns a short database name used for titles.
        """
        return self.db_name

    def get_db_engine_type(self) -> str:
        """
        Returns the engine type (e.g. mysql).
        """
        return self.db.engine.name

    def get_annotation_when_no_ddr_found(self, col_name: str) -> str:
        """
        Returns best-guess CRATE annotation information when no data
        dictionary row is available.

        Args:
            col_name:
                Column name.

        Returns:
            The annotation for one of the config's special column names, or
            ``DDRLabels.UNKNOWN`` for any other column.
        """
        return self.annotation_from_colname.get(col_name, DDRLabels.UNKNOWN)
def template(filename: str) -> str:
    """
    Locates a report template.

    Args:
        filename:
            Template filename (e.g. one of the ``Templates`` constants).

    Returns:
        That filename joined onto TEMPLATE_DIR.
    """
    template_path = os.path.join(TEMPLATE_DIR, filename)
    return template_path
def mk_comment(
    reportcfg: ResearcherReportConfig,
    column: Column,
    ddr: DataDictionaryRow = None,
) -> str:
    """
    Chooses the comment to display for a column.

    The database may hold its own column comment, and the CRATE data
    dictionary (DD) may hold another. If the DD is not in use, only the
    database comment counts. Otherwise, if one comment contains (or equals)
    the other, the longer is shown once, to avoid duplication; if they
    differ, both are shown, labelled. An en dash stands in when there is
    nothing to show.

    Args:
        reportcfg:
            ResearcherReportConfig object, governing the report.
        column:
            SQLAlchemy Column whose comment is wanted.
        ddr:
            Corresponding CRATE DataDictionaryRow, if there is one.
    """
    db_text = column.comment if column.comment else ""

    if not reportcfg.use_dd:
        # Data dictionary disabled: database comment or nothing.
        return db_text if db_text else EN_DASH

    dd_text = ""
    if ddr and ddr.comment:
        dd_text = ddr.comment

    if not (db_text or dd_text):
        return EN_DASH
    if dd_text in db_text:  # within, or equals (or the DD comment is blank)
        return db_text
    if db_text in dd_text:  # likewise, the other way round
        return dd_text
    return f"[DB] {db_text} [DD] {dd_text}"
def literal(
    value: Any,
    max_length: int = Default.MAX_VALUE_LENGTH,
    truncated_suffix: str = ELLIPSIS,
) -> str:
    """
    Formats a value approximately as an SQL literal, for people to read (not
    for execution).

    - Strings are shown in typographic single quotes, and cut short at
      ``max_length`` characters; a cut string gets ``truncated_suffix``
      inside the closing quote and a note of its full length after it.
      Embedded quotes are not escaped.
    - Numbers use a true minus sign rather than a hyphen.
    - Dates/times are NOT enclosed in quotes here.
    - Binary data is represented by its length only.
    - Some duplication from within
      cardinal_pythonlib.sqlalchemy.dump.get_literal_query.

    Args:
        value:
            The value to represent.
        max_length:
            Maximum number of characters of a string to show.
        truncated_suffix:
            Marker appended to a truncated string.

    Raises:
        NotImplementedError:
            if the value is of a type not catered for.
    """
    if value is None:
        return "NULL"

    if isinstance(value, str):
        n_chars = len(value)
        if n_chars <= max_length:
            return SINGLE_QUOTE_L + value + SINGLE_QUOTE_R
        # We won't escape quotes. This report is about visual ease, not
        # electronic exactness.
        return (
            SINGLE_QUOTE_L
            + value[:max_length]
            + truncated_suffix
            + SINGLE_QUOTE_R
            + f" [length {n_chars}]"
        )

    if isinstance(value, (float, int)):
        return repr(value).replace(HYPHEN, MINUS)

    if isinstance(value, decimal.Decimal):
        return str(value).replace(HYPHEN, MINUS)

    # Test for datetime before date: a datetime is also a date.
    if isinstance(value, (datetime.datetime, pendulum.DateTime)):
        return value.strftime(DateFormat.DATETIME)

    if isinstance(value, (datetime.date, pendulum.Date)):
        return value.strftime(DateFormat.DATE)

    if isinstance(value, (datetime.time, pendulum.Time)):
        return value.strftime(DateFormat.TIME)

    if isinstance(value, bytes):
        return f"<binary_length_{len(value)}>"

    if isinstance(value, datetime.timedelta):
        return strfdelta(value, fmt=DateFormat.TIMEDELTA)

    if isinstance(value, enum.Enum):
        return f"{value.name} ({value.value})"

    raise NotImplementedError(f"Don't know how to represent value {value!r}")
def sorter(x: Any) -> Tuple[bool, Any]:
    """
    Sort key for values that may be None/NULL.

    The key is a pair whose first element says whether the value is present.
    Since False < True, None values sort first, and are never compared
    directly against non-None values.
    """
    is_present = x is not None
    return (is_present, x)
346# =============================================================================
347# Researcher report about destination database
348# =============================================================================
def get_values_summary(
    column: Column,
    reportcfg: ResearcherReportConfig,
    ddr: Optional[DataDictionaryRow] = None,
) -> str:
    """
    Return a textual summary of values in a column (from a de-identified
    database).

    The summary always gives the number of distinct non-NULL values (unless
    values are disabled altogether, in which case an en dash is returned).
    For columns that are not empty, not flagged by the data dictionary as
    relating to patient/third-party/scrubbing information, and not one of
    CRATE's own special columns, it may add the minimum/maximum and a list
    of distinct values (up to ``reportcfg.max_distinct_values`` of them).

    Args:
        column:
            SQLAlchemy Column object to summarize. (It knows its own Table.)
        reportcfg:
            ResearcherReportConfig object, governing the report.
        ddr:
            Corresponding CRATE DataDictionaryRow, if there is one.

    Returns:
        The summary, as a single string of space-separated sentences.
    """
    if not reportcfg.show_values:
        # Don't show anything.
        return EN_DASH

    # Otherwise, we can always do the number of distinct values:
    items = []  # type: List[str]
    session = reportcfg.db_session
    n_distinct_notnull = session.execute(
        select(func.count(distinct(column)))
    ).fetchone()[0]
    # This does NOT include NULL values, by the SQL standard.
    suffix = "" if n_distinct_notnull == 1 else "s"  # "value" or "values"?
    items.append(f"{n_distinct_notnull} distinct non-null value{suffix}.")

    show_min_max = False
    show_distinct = False  # show the actual distinct values?

    empty = n_distinct_notnull == 0
    sensitive = (
        not empty
        and ddr
        and (
            ddr.contains_patient_info
            or ddr.contains_third_party_info
            or ddr.contains_scrub_src
            or ddr.being_scrubbed
        )
    )
    # ... not *actually* sensitive; merely having the appearance of being
    # sensitive for a general-purpose report.
    dull = (
        not empty
        and not sensitive
        and reportcfg.use_dd
        and not ddr
        and column.name in reportcfg.annotation_from_colname
    )
    # ... one of CRATE's own special columns (e.g. a research ID), which has
    # no data dictionary row of its own.

    if not (empty or sensitive or dull):
        # Show some more detail.
        if n_distinct_notnull > 1:
            # A range is only informative with at least two values.
            show_min_max = True
        if (
            n_distinct_notnull <= reportcfg.max_distinct_values
            or not reportcfg.skip_values_if_too_many
        ):
            show_distinct = True

    def lit(value: Any) -> str:
        """
        Format a value for display, applying the configured length limit.
        """
        return literal(value, reportcfg.max_value_length)

    if show_min_max:
        min_val, max_val = session.execute(
            select(func.min(column), func.max(column))
        ).fetchone()
        items.append(f"Min {lit(min_val)}; max {lit(max_val)}.")

    if show_distinct:
        # The row limit must be based on the maximum NUMBER of values
        # (max_distinct_values), not the maximum LENGTH of one value
        # (max_value_length); otherwise, when max_value_length is the
        # smaller of the two, values are dropped without the truncation
        # indicator below ever being shown.
        dv_rows = session.execute(
            select(column)
            .distinct()
            .order_by(column)
            .limit(reportcfg.max_distinct_values + 1)
        ).fetchall()
        # These WILL include any NULL values, so there may be one more than
        # n_distinct_notnull (or the same, if there are no NULLs). The only
        # way to be sure if we are truncating, therefore, and to show a
        # truncation indicator, is to fetch up to one more and see if we are
        # over the limit.
        # Sort before literal (so we get numeric, not string, sort):
        distinct_values = sorted((row[0] for row in dv_rows), key=sorter)
        distinct_value_elements = [lit(v) for v in distinct_values]
        if len(distinct_values) > reportcfg.max_distinct_values:
            distinct_value_elements = distinct_value_elements[
                0 : reportcfg.max_distinct_values
            ] + [ELLIPSIS]
        distinct_value_str = ", ".join(distinct_value_elements)
        items.append(f"Distinct values: {{{distinct_value_str}}}.")
        # It's a set, so use set notation.

    return " ".join(items)
def mk_table_html(table_name: str, reportcfg: ResearcherReportConfig) -> str:
    """
    Builds the section of the report describing one table: its comment, its
    row count (if wanted), and a description of each of its columns, in
    column name order.

    Args:
        table_name:
            Table to process.
        reportcfg:
            ResearcherReportConfig object, governing the report.

    Returns:
        HTML as a string.
    """
    log.info(f"Processing table: {table_name}")

    # Data dictionary rows relating to this table, if we're using the DD.
    if reportcfg.use_dd:
        dest_ddr_rows = reportcfg.anonconfig.dd.get_rows_for_dest_table(
            table_name
        )
    else:
        dest_ddr_rows = []
    db_session = reportcfg.db_session

    # Rows versus records: https://dba.stackexchange.com/questions/31805/
    if reportcfg.show_counts:
        count_query = select(func.count()).select_from(table(table_name))
        n_rows = db_session.execute(count_query).fetchone()[0]
    else:
        n_rows = None

    reflected = reportcfg.db.metadata.tables[table_name]  # type: Table
    table_comment = reflected.comment or ""  # may be blank
    column_infos = []  # type: List[ReflectedColumnInfo]
    for col in sorted(
        reflected.c, key=lambda candidate: candidate.name
    ):  # type: Column
        log.debug(repr(col))
        col_name = col.name

        # Find the first matching data dictionary row, and an annotation.
        ddr = None
        crate_annotation = ""
        if reportcfg.use_dd:
            try:
                ddr = next(
                    row for row in dest_ddr_rows if row.dest_field == col_name
                )
                crate_annotation = ddr.report_dest_annotation()
            except StopIteration:
                # Not in the data dictionary; make a best guess.
                ddr = None
                crate_annotation = reportcfg.get_annotation_when_no_ddr_found(
                    col_name=col_name
                )

        values_info = get_values_summary(
            column=col,
            reportcfg=reportcfg,
            ddr=ddr,
        )
        info = ReflectedColumnInfo(
            column=col,
            override_comment=mk_comment(reportcfg, col, ddr),
            crate_annotation=crate_annotation,
            values_info=values_info,
        )
        column_infos.append(info)

    context = {
        "columns": column_infos,
        "n_rows": n_rows,
        "show_counts": reportcfg.show_counts,
        "show_values": reportcfg.show_values,
        "table_comment": table_comment,
        "table_name": table_name,
        "use_dd": reportcfg.use_dd,
    }
    return render_to_string(template(Templates.TABLE), context)
def mk_researcher_report_html(
    reportcfg: ResearcherReportConfig,
) -> Tuple[str, str, str]:
    """
    Builds the researcher-oriented report about a destination database, as
    HTML. The page header and footer are rendered separately from the main
    document (e.g. so that a PDF can carry them on every page).

    Args:
        reportcfg:
            ResearcherReportConfig object, governing the report.

    Returns:
        tuple: header_html, html, footer_html
    """
    # -------------------------------------------------------------------------
    # 1. Make sure Django's template system is usable.
    # -------------------------------------------------------------------------
    # https://stackoverflow.com/questions/28123603
    if not settings.configured:
        # Settings will already be configured when testing with pytest
        template_backend = {
            "BACKEND": "django.template.backends.django.DjangoTemplates",
            "DIRS": [TEMPLATE_DIR],
        }
        settings.configure(TEMPLATES=[template_backend])
        django.setup()

    # -------------------------------------------------------------------------
    # 2. Values shared by all the templates
    # -------------------------------------------------------------------------
    db_name = reportcfg.get_db_name()
    now = format_datetime(get_now_localtz_pendulum(), DateFormat.PRETTY)
    title = f"{db_name}: CRATE researcher report, {now}"
    css_context = {"base_font_size": reportcfg.base_font_size}
    css = render_to_string(template(Templates.STYLE), css_context)
    coredict = {"title": title, "css": css, "now": now}

    # -------------------------------------------------------------------------
    # 3. Header/footer (e.g. for PDF page numbers)
    # -------------------------------------------------------------------------
    header_html = render_to_string(template(Templates.PDF_HEADER), coredict)
    footer_html = render_to_string(template(Templates.PDF_FOOTER), coredict)

    # -------------------------------------------------------------------------
    # 4. Find the tables.
    # -------------------------------------------------------------------------
    table_names = sorted(reportcfg.db.table_names)  # reflects (introspects)

    # -------------------------------------------------------------------------
    # 5. Describe each table, then assemble the main report around them.
    # -------------------------------------------------------------------------
    tables_html = "".join(
        mk_table_html(table_name, reportcfg) for table_name in table_names
    )
    report_context = {
        "CRATE_VERSION": CRATE_VERSION,
        "db_engine": reportcfg.get_db_engine_type(),
        "db_name": db_name,
        "n_tables": len(table_names),
        "table_names": table_names,
        "tables_html": tables_html,
        "url": reportcfg.safe_db_url_if_selected(),
        **coredict,
    }
    html = render_to_string(template(Templates.REPORT), report_context)

    # -------------------------------------------------------------------------
    # 6. Hand back the three components.
    # -------------------------------------------------------------------------
    return header_html, html, footer_html
def mk_researcher_report_pdf(
    reportcfg: ResearcherReportConfig,
) -> bool:
    """
    Builds the researcher-oriented report about a destination database, and
    writes it to disk as a PDF (at ``reportcfg.output_filename``).

    Args:
        reportcfg:
            ResearcherReportConfig object, governing the report.

    Returns:
        success
    """
    header_html, html, footer_html = mk_researcher_report_html(reportcfg)

    output_path = reportcfg.output_filename
    log.info(f"Writing to {output_path}")

    # A single flag switches on all of the PDF generator's debugging output.
    debug = reportcfg.debug_pdf
    success = make_pdf_on_disk_from_html(
        html=html,
        output_path=output_path,
        header_html=header_html,
        footer_html=footer_html,
        wkhtmltopdf_options=reportcfg.wkhtmltopdf_options(),
        debug_options=debug,
        debug_content=debug,
        debug_wkhtmltopdf_args=debug,
    )
    return success
636# =============================================================================
637# Main
638# =============================================================================
def _make_arg_parser() -> argparse.ArgumentParser:
    """
    Builds the command-line argument parser for the researcher report tool.
    """
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description=f"""
Produce a researcher-oriented PDF report about a destination database.
({CRATE_VERSION_PRETTY})

Note: if wkhtmltopdf reports 'Too many open files', see
- https://stackoverflow.com/q/25355697;
- https://github.com/wkhtmltopdf/wkhtmltopdf/issues/3081;
setting e.g. "ulimit -n 2048" is one solution.

""",
        formatter_class=RawDescriptionArgumentDefaultsRichHelpFormatter,
    )

    parser.add_argument("output", help="PDF output filename")

    grp_db = parser.add_argument_group("DATABASE")
    grp_db.add_argument(
        "--config",
        help=f"CRATE anonymisation config file, overriding environment "
        f"variable {ANON_CONFIG_ENV_VAR}",
    )
    grp_db.add_argument(
        "--noconfig",
        action="store_true",
        help="Do not use a config file (unusual)",
    )
    grp_db.add_argument(
        "--db_url",
        type=str,
        default=None,
        help="Database URL, overriding that in the config file",
    )
    grp_db.add_argument(
        "--db_name",
        type=str,
        default=None,
        help="Database name, overriding that in the config file; must be "
        "specified if you use --db_url",
    )

    # The options below come in on/off pairs sharing a "dest". In CPython's
    # argparse, when several actions share a dest, the default that takes
    # effect is that of the action added first. The default given to the
    # second of each pair therefore only affects the help text (which shows
    # whether that flag describes the default state).
    grp_detail = parser.add_argument_group("DETAIL")
    grp_detail.add_argument(
        "--show_url",
        dest="show_url",
        action="store_true",
        default=False,
        help="Include sanitised, password-safe version of database URL",
    )
    grp_detail.add_argument(
        "--no_show_url",
        dest="show_url",
        action="store_false",
        default=True,
        help="Do not include database URL",
    )
    grp_detail.add_argument(
        "--show_counts",
        dest="show_counts",
        action="store_true",
        default=True,
        help="Include row counts for each table",
    )
    grp_detail.add_argument(
        "--no_show_counts",
        dest="show_counts",
        action="store_false",
        default=False,
        help="Do not include row counts",
    )
    grp_detail.add_argument(
        "--use_dd",
        dest="use_dd",
        action="store_true",
        default=True,
        help="Use information obtainable from the CRATE data dictionary (DD), "
        "including comments, annotations, and value suppression for "
        "potentially sensitive fields",
    )
    grp_detail.add_argument(
        "--no_use_dd",
        dest="use_dd",
        action="store_false",
        default=False,
        help="Do not use information from the CRATE data dictionary; only "
        "sensible for reporting on a database completely unrelated to the "
        "DD",
    )
    grp_detail.add_argument(
        "--show_values",
        dest="show_values",
        action="store_true",
        default=True,
        help="Include specimen values/ranges",
    )
    grp_detail.add_argument(
        "--no_show_values",
        dest="show_values",
        action="store_false",
        default=False,
        help="Do not include specimen values/ranges",
    )
    grp_detail.add_argument(
        "--max_distinct_values",
        type=int,
        default=Default.MAX_DISTINCT_VALUES,
        help="Maximum number of distinct values to show, if applicable",
    )
    grp_detail.add_argument(
        "--skip_values_if_too_many",
        action="store_true",
        help="If showing values, and there are more distinct values than the "
        "maximum, omit them (rather than showing the first few)?",
    )
    grp_detail.add_argument(
        "--max_value_length",
        type=int,
        default=Default.MAX_VALUE_LENGTH,
        help="Maximum string length to show for a literal value",
    )

    grp_visuals = parser.add_argument_group("VISUALS")
    grp_visuals.add_argument(
        "--page_size",
        default=Default.PAGE_SIZE,
        help="Page size, i.e. paper type",
    )
    grp_visuals.add_argument(
        "--margin_left_right",
        default=Default.MARGIN_LEFT_RIGHT,
        help="Page left/right margins, with units",
    )
    grp_visuals.add_argument(
        "--margin_top_bottom",
        default=Default.MARGIN_TOP_BOTTOM,
        help="Page top/bottom margins for content, ignoring header/footer "
        "(see --header_footer_spacing_mm), with units",
    )
    grp_visuals.add_argument(
        "--header_footer_spacing_mm",
        type=int,
        default=Default.HEADER_FOOTER_SPACING_MM,
        help="Gap between content and header/footer, in mm",
    )
    grp_visuals.add_argument(
        "--orientation",
        choices=["portrait", "landscape"],
        default=Default.ORIENTATION,
        help="Page orientation",
    )
    grp_visuals.add_argument(
        "--base_font_size",
        default=Default.BASE_FONT_SIZE,
        help="Base font size, with units",
    )

    grp_progress = parser.add_argument_group("PROGRESS")
    grp_progress.add_argument(
        "--verbose", "-v", action="store_true", help="Be verbose"
    )
    grp_progress.add_argument(
        "--debug_pdf", action="store_true", help="Debug PDF creation"
    )

    return parser


def main() -> None:
    """
    Command-line entry point: parses the arguments, sets up logging, loads
    the anonymisation config (unless told not to), and writes the report.
    """
    args = _make_arg_parser().parse_args()

    # -------------------------------------------------------------------------
    # Verbosity, logging
    # -------------------------------------------------------------------------

    loglevel = logging.DEBUG if args.verbose else logging.INFO
    main_only_quicksetup_rootlogger(level=loglevel)

    # -------------------------------------------------------------------------
    # Onwards
    # -------------------------------------------------------------------------

    if args.config:
        # The config is located via an environment variable, so this must be
        # set before the config singleton is imported, below.
        os.environ[ANON_CONFIG_ENV_VAR] = args.config
    if args.noconfig:
        log.info("Not using a CRATE anonymisation config file")
        config = None
    else:
        # Delayed import: only touch the config if one is wanted.
        from crate_anon.anonymise.config_singleton import (
            config,
        )

    reportcfg = ResearcherReportConfig(
        anonconfig=config,
        base_font_size=args.base_font_size,
        db_name=args.db_name,
        db_url=args.db_url,
        debug_pdf=args.debug_pdf,
        header_footer_spacing_mm=args.header_footer_spacing_mm,
        margin_left_right=args.margin_left_right,
        margin_top_bottom=args.margin_top_bottom,
        max_distinct_values=args.max_distinct_values,
        max_value_length=args.max_value_length,
        orientation=args.orientation,
        output_filename=args.output,
        page_size=args.page_size,
        show_counts=args.show_counts,
        show_url=args.show_url,
        show_values=args.show_values,
        skip_values_if_too_many=args.skip_values_if_too_many,
        use_dd=args.use_dd,
    )

    mk_researcher_report_pdf(reportcfg)


if __name__ == "__main__":
    main()