Module taulu.header_template

A HeaderTemplate defines the structure of a table header.

Classes

class HeaderTemplate (rules: Iterable[Iterable[int]])
Expand source code
class HeaderTemplate(TableIndexer):
    def __init__(self, rules: Iterable[Iterable[int]]):
        """
        A HeaderTemplate is a collection of rules (lines) of a table header. This class
        implements methods for finding cell positions in a table image, given the
        template the image adheres to.

        Args:
            rules: 2D array of lines, where each line is represented as [x0, y0, x1, y1]
        """

        super().__init__()
        self._rules = [_Rule(*rule) for rule in rules]
        # Horizontal rules are kept sorted by `_y` and vertical rules by `_x`, so
        # that neighbouring entries delimit neighbouring rows / columns.
        self._h_rules = sorted(
            (rule for rule in self._rules if rule._is_horizontal()), key=lambda r: r._y
        )
        self._v_rules = sorted(
            (rule for rule in self._rules if rule._is_vertical()), key=lambda r: r._x
        )

    @log_calls(level=logging.DEBUG)
    def save(self, path: PathLike[str]):
        """
        Save the HeaderTemplate to the given path, as a json

        The file contains a single object with a "rules" key, holding the
        `to_dict()` representation of every rule.
        """

        data = {"rules": [r.to_dict() for r in self._rules]}

        with open(path, "w") as f:
            json.dump(data, f)

    @staticmethod
    @log_calls(level=logging.DEBUG)
    def from_saved(path: PathLike[str]) -> "HeaderTemplate":
        """
        Load a HeaderTemplate from a json file.

        The file is expected to hold an object with a "rules" list whose entries
        have the keys "x0", "y0", "x1" and "y1".

        Args:
            path: location of the json file
        """

        with open(path, "r") as f:
            data = json.load(f)

        rules = [[r["x0"], r["y0"], r["x1"], r["y1"]] for r in data["rules"]]
        return HeaderTemplate(rules)

    @property
    def cols(self) -> int:
        """Number of columns: one less than the number of vertical rules."""
        return len(self._v_rules) - 1

    @property
    def rows(self) -> int:
        """Number of rows: one less than the number of horizontal rules."""
        return len(self._h_rules) - 1

    @staticmethod
    @log_calls(level=logging.DEBUG)
    def annotate_image(
        template: MatLike | str, crop: Optional[PathLike[str]] = None, margin: int = 10
    ) -> "HeaderTemplate":
        """
        Utility method that allows users to create a template from a template image.

        The user is asked to click to annotate lines (two clicks per line).
        A right click discards a pending first click and removes the last
        annotated line.

        Args:
            template: the image on which to annotate the header lines
            crop (str | None): if str, crop the template image first, then do the annotation.
                The cropped image will be stored at the supplied path
            margin (int): margin to add around the cropping of the header

        Raises:
            TauluException: if `template` is a path that cannot be read as an image
        """

        if isinstance(template, str):
            loaded = cv.imread(template)
            if loaded is None:
                # cv.imread reports an unreadable file by returning None
                raise TauluException(f"could not read template image '{template}'")
            template = loaded
        template = cast(MatLike, template)

        if crop is not None:
            cropped = HeaderTemplate._crop(template, margin)
            cv.imwrite(os.fspath(crop), cropped)
            template = cropped

        start_point = None
        lines: list[list[int]] = []

        # all drawing happens on a copy, `template` itself stays untouched
        anno_template = np.copy(template)

        def get_point(event, x, y, flags, params):
            nonlocal start_point, anno_template
            _ = flags
            _ = params
            if event == cv.EVENT_LBUTTONDOWN:
                if start_point is None:
                    # first click of a line: remember it and wait for the second
                    start_point = (x, y)
                    return

                line: list[int] = [start_point[0], start_point[1], x, y]

                cv.line(  # type:ignore
                    anno_template,  # type:ignore
                    start_point,
                    (x, y),
                    (0, 255, 0),
                    2,
                    cv.LINE_AA,
                )
                cv.imshow(constants.WINDOW, anno_template)  # type:ignore

                lines.append(line)
                start_point = None
            elif event == cv.EVENT_RBUTTONDOWN:
                start_point = None

                # remove the last annotation
                if lines:
                    lines.pop()

                # start again from the clean image and redraw what is left, so
                # the removed line actually disappears
                anno_template = np.copy(template)

                for x0, y0, x1, y1 in lines:
                    cv.line(  # type:ignore
                        anno_template,  # type:ignore
                        (x0, y0),
                        (x1, y1),
                        (0, 255, 0),
                        2,
                        cv.LINE_AA,
                    )

                cv.imshow(constants.WINDOW, anno_template)  # type:ignore

        print(ANNO_HELP)

        imu.show(anno_template, get_point, title="annotate the header")

        return HeaderTemplate(lines)

    @staticmethod
    @log_calls(level=logging.DEBUG, include_return=True)
    def _crop(template: MatLike, margin: int = 10) -> MatLike:
        """
        Crop the image to contain only the annotations, such that it can be used as the header image in the taulu workflow.

        The user clicks the four corners of the table; the result is the
        axis-aligned bounding box of those points, grown by `margin` pixels on
        every side (never extending past the top/left image border).

        Args:
            template: the image to crop
            margin (int): number of pixels to add around the bounding box

        Returns:
            the cropped region of `template`
        """

        points: list[tuple[int, int]] = []
        anno_template = np.copy(template)

        def get_point(event, x, y, flags, params):
            nonlocal anno_template
            _ = flags
            _ = params
            if event == cv.EVENT_LBUTTONDOWN:
                cv.circle(  # type:ignore
                    anno_template,  # type:ignore
                    (x, y),
                    4,
                    (0, 255, 0),
                    2,
                )
                cv.imshow(constants.WINDOW, anno_template)  # type:ignore

                points.append((x, y))
            elif event == cv.EVENT_RBUTTONDOWN:
                # remove the last annotation
                if points:
                    points.pop()

                # redraw the remaining markers on a clean copy, so the removed
                # marker is no longer visible
                anno_template = np.copy(template)

                for p in points:
                    cv.circle(  # type:ignore
                        anno_template,  # type:ignore
                        p,
                        4,
                        (0, 255, 0),
                        2,
                    )

                cv.imshow(constants.WINDOW, anno_template)  # type:ignore

        print(CROP_HELP)

        imu.show(anno_template, get_point, title="crop the header")

        assert len(points) == 4, (
            "you need to annotate the four corners of the table in order to crop it"
        )

        # crop the image to contain all of the points (rectangular bounding box)
        points_np = np.array(points)

        x_min = int(np.min(points_np[:, 0]))
        y_min = int(np.min(points_np[:, 1]))
        x_max = int(np.max(points_np[:, 0]))
        y_max = int(np.max(points_np[:, 1]))

        # Clamp the start of each slice at 0: a negative start would be read as
        # "counted from the end" and produce a wrong (usually empty) crop.
        # An end past the image border is clipped by the slicing itself.
        left = max(x_min - margin, 0)
        top = max(y_min - margin, 0)
        right = max(x_max + margin, left)
        bottom = max(y_max + margin, top)

        cropped = template[top:bottom, left:right]

        return cropped

    @staticmethod
    def from_vgg_annotation(annotation: str) -> "HeaderTemplate":
        """
        Create a HeaderTemplate from annotations made in [vgg](https://annotate.officialstatistics.org/), using the polylines tool.

        Only regions whose shape is a "polyline" with exactly two points are
        used; every other row of the csv file is ignored.

        Args:
            annotation (str): the path of the annotation csv file
        """

        rules = []
        # newline="" is what the csv module asks for, so that quoted fields
        # containing line breaks are parsed correctly
        with open(annotation, "r", newline="") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                shape_attributes = json.loads(row["region_shape_attributes"])
                if shape_attributes["name"] != "polyline":
                    continue

                x_points = shape_attributes["all_points_x"]
                y_points = shape_attributes["all_points_y"]
                if len(x_points) == 2 and len(y_points) == 2:
                    rules.append([x_points[0], y_points[0], x_points[1], y_points[1]])

        return HeaderTemplate(rules)

    def cell_width(self, i: int) -> int:
        """
        Width of column `i`: the difference between the `_x` of the vertical
        rules on either side of it, truncated to an int.
        """
        self._check_col_idx(i)
        return int(self._v_rules[i + 1]._x - self._v_rules[i]._x)

    def cell_widths(self, start: int = 0) -> list[int]:
        """Widths of all columns from column `start` up to the last one."""
        return [self.cell_width(i) for i in range(start, self.cols)]

    def cell_height(self, header_factor: float = 0.8) -> int:
        """
        Distance between the first two horizontal rules, multiplied by
        `header_factor` and truncated to an int.
        """
        return int((self._h_rules[1]._y - self._h_rules[0]._y) * header_factor)

    def cell_heights(self, header_factors: list[float] | float) -> list[int]:
        """
        Like `cell_height`, but for several factors at once.

        Args:
            header_factors: a single factor (float or int) or a list of factors

        Returns:
            one height per factor, in the same order
        """
        if isinstance(header_factors, (int, float)):
            # a bare number (an int such as 1 as well) counts as a one-element list
            factors = [header_factors]
        else:
            factors = list(header_factors)

        header_height = self._h_rules[1]._y - self._h_rules[0]._y
        return [int(header_height * f) for f in factors]

    def intersection(self, index: tuple[int, int]) -> tuple[float, float]:
        """
        Returns the intersection of the index[0]th horizontal rule and the
        index[1]th vertical rule
        """

        ints = self._h_rules[index[0]].intersection(self._v_rules[index[1]])
        assert ints is not None
        return ints

    def cell(self, point: tuple[float, float]) -> tuple[int, int]:
        """
        Get the cell index (row, col) that corresponds with the point (x, y) in the template image

        Args:
            point (tuple[float, float]): the coordinates in the template image

        Returns:
            tuple[int, int]: (row, col), or (-1, -1) when the point does not lie
            inside any cell
        """

        x, y = point

        row = -1
        col = -1

        # the row is the first pair of consecutive horizontal rules that
        # enclose y at this x (bounds included)
        for i in range(self.rows):
            y0 = self._h_rules[i]._y_at_x(x)
            y1 = self._h_rules[i + 1]._y_at_x(x)
            if min(y0, y1) <= y <= max(y0, y1):
                row = i
                break

        # same for the column, with consecutive vertical rules enclosing x
        for i in range(self.cols):
            x0 = self._v_rules[i]._x_at_y(y)
            x1 = self._v_rules[i + 1]._x_at_y(y)
            if min(x0, x1) <= x <= max(x0, x1):
                col = i
                break

        if row == -1 or col == -1:
            return (-1, -1)

        return (row, col)

    def cell_polygon(
        self, cell: tuple[int, int]
    ) -> tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]:
        """
        Return points (x,y) that make up a polygon around the requested cell
        (top left, top right, bottom right, bottom left)

        Raises:
            TauluException: if the rules around the cell do not all intersect
        """

        row, col = cell

        self._check_col_idx(col)
        self._check_row_idx(row)

        top_rule = self._h_rules[row]
        bottom_rule = self._h_rules[row + 1]
        left_rule = self._v_rules[col]
        right_rule = self._v_rules[col + 1]

        # Calculate corner points using intersections
        top_left = top_rule.intersection(left_rule)
        top_right = top_rule.intersection(right_rule)
        bottom_left = bottom_rule.intersection(left_rule)
        bottom_right = bottom_rule.intersection(right_rule)

        if any(
            point is None for point in (top_left, top_right, bottom_left, bottom_right)
        ):
            raise TauluException("the lines around this cell do not intersect")

        return top_left, top_right, bottom_right, bottom_left  # type:ignore

    def region(
        self, start: tuple[int, int], end: tuple[int, int]
    ) -> tuple[Point, Point, Point, Point]:
        """
        Return the corner points of the region that spans from cell `start` to
        cell `end` (both (row, col), both included), as integer points in the
        order top left, top right, bottom right, bottom left.

        Raises:
            TauluException: if the rules around the region do not all intersect
        """
        self._check_row_idx(start[0])
        self._check_row_idx(end[0])
        self._check_col_idx(start[1])
        self._check_col_idx(end[1])

        # the rules that surround this region
        top_rule = self._h_rules[start[0]]
        bottom_rule = self._h_rules[end[0] + 1]
        left_rule = self._v_rules[start[1]]
        right_rule = self._v_rules[end[1] + 1]

        # four points that will be the bounding polygon of the result,
        # which needs to be rectified
        top_left = top_rule.intersection(left_rule)
        top_right = top_rule.intersection(right_rule)
        bottom_left = bottom_rule.intersection(left_rule)
        bottom_right = bottom_rule.intersection(right_rule)

        if (
            top_left is None
            or top_right is None
            or bottom_left is None
            or bottom_right is None
        ):
            raise TauluException("the lines around this row do not intersect properly")

        def to_point(pnt) -> Point:
            return (int(pnt[0]), int(pnt[1]))

        return (
            to_point(top_left),
            to_point(top_right),
            to_point(bottom_right),
            to_point(bottom_left),
        )

    def text_regions(
        self, img: MatLike, row: int, margin_x: int = 10, margin_y: int = -20
    ) -> list[tuple[tuple[int, int], tuple[int, int]]]:
        """Not available on a HeaderTemplate: always raises TauluException."""
        raise TauluException("text_regions should not be called on a HeaderTemplate")

Subclasses implement methods for going from a pixel in the input image to a table cell index, and cropping an image to the given table cell index.

A HeaderTemplate is a collection of rules of a table. This class implements methods for finding cell positions in a table image, given the template the image adheres to.

Args

rules
2D array of lines, where each line is represented as [x0, y0, x1, y1]

Ancestors

Static methods

def annotate_image(template: cv2.Mat | numpy.ndarray | str,
crop: os.PathLike[str] | None = None,
margin: int = 10) ‑> HeaderTemplate
Expand source code
@staticmethod
@log_calls(level=logging.DEBUG)
def annotate_image(
    template: MatLike | str, crop: Optional[PathLike[str]] = None, margin: int = 10
) -> "HeaderTemplate":
    """
    Utility method that allows users to create a template from a template image.

    The user is asked to click to annotate lines (two clicks per line).
    A right click discards a pending first click and removes the last
    annotated line.

    Args:
        template: the image on which to annotate the header lines
        crop (str | None): if str, crop the template image first, then do the annotation.
            The cropped image will be stored at the supplied path
        margin (int): margin to add around the cropping of the header

    Raises:
        TauluException: if `template` is a path that cannot be read as an image
    """

    if isinstance(template, str):
        loaded = cv.imread(template)
        if loaded is None:
            # cv.imread reports an unreadable file by returning None
            raise TauluException(f"could not read template image '{template}'")
        template = loaded
    template = cast(MatLike, template)

    if crop is not None:
        cropped = HeaderTemplate._crop(template, margin)
        cv.imwrite(os.fspath(crop), cropped)
        template = cropped

    start_point = None
    lines: list[list[int]] = []

    # all drawing happens on a copy, `template` itself stays untouched
    anno_template = np.copy(template)

    def get_point(event, x, y, flags, params):
        nonlocal start_point, anno_template
        _ = flags
        _ = params
        if event == cv.EVENT_LBUTTONDOWN:
            if start_point is None:
                # first click of a line: remember it and wait for the second
                start_point = (x, y)
                return

            line: list[int] = [start_point[0], start_point[1], x, y]

            cv.line(  # type:ignore
                anno_template,  # type:ignore
                start_point,
                (x, y),
                (0, 255, 0),
                2,
                cv.LINE_AA,
            )
            cv.imshow(constants.WINDOW, anno_template)  # type:ignore

            lines.append(line)
            start_point = None
        elif event == cv.EVENT_RBUTTONDOWN:
            start_point = None

            # remove the last annotation
            if lines:
                lines.pop()

            # start again from the clean image and redraw what is left, so
            # the removed line actually disappears
            anno_template = np.copy(template)

            for x0, y0, x1, y1 in lines:
                cv.line(  # type:ignore
                    anno_template,  # type:ignore
                    (x0, y0),
                    (x1, y1),
                    (0, 255, 0),
                    2,
                    cv.LINE_AA,
                )

            cv.imshow(constants.WINDOW, anno_template)  # type:ignore

    print(ANNO_HELP)

    imu.show(anno_template, get_point, title="annotate the header")

    return HeaderTemplate(lines)

Utility method that allows users to create a template from a template image.

The user is asked to click to annotate lines (two clicks per line).

Args

template
the image on which to annotate the header lines
crop : str | None
if str, crop the template image first, then do the annotation. The cropped image will be stored at the supplied path
margin : int
margin to add around the cropping of the header
def from_saved(path: os.PathLike[str]) ‑> HeaderTemplate
Expand source code
@staticmethod
@log_calls(level=logging.DEBUG)
def from_saved(path: PathLike[str]) -> "HeaderTemplate":
    """
    Load a HeaderTemplate from a json file.

    The file must hold an object with a "rules" list whose entries have the
    keys "x0", "y0", "x1" and "y1".
    """
    with open(path, "r") as handle:
        payload = json.load(handle)

    coordinates = [
        [entry["x0"], entry["y0"], entry["x1"], entry["y1"]]
        for entry in payload["rules"]
    ]
    return HeaderTemplate(coordinates)
def from_vgg_annotation(annotation: str) ‑> HeaderTemplate
Expand source code
@staticmethod
def from_vgg_annotation(annotation: str) -> "HeaderTemplate":
    """
    Create a HeaderTemplate from annotations made in [vgg](https://annotate.officialstatistics.org/), using the polylines tool.

    Only regions whose shape is a "polyline" with exactly two points are
    used; every other row of the csv file is ignored.

    Args:
        annotation (str): the path of the annotation csv file
    """

    rules = []
    # newline="" is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly
    with open(annotation, "r", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            shape_attributes = json.loads(row["region_shape_attributes"])
            if shape_attributes["name"] != "polyline":
                continue

            x_points = shape_attributes["all_points_x"]
            y_points = shape_attributes["all_points_y"]
            if len(x_points) == 2 and len(y_points) == 2:
                rules.append([x_points[0], y_points[0], x_points[1], y_points[1]])

    return HeaderTemplate(rules)

Create a HeaderTemplate from annotations made in vgg, using the polylines tool.

Args

annotation : str
the path of the annotation csv file

Instance variables

prop cols : int
Expand source code
@property
def cols(self) -> int:
    """Number of columns: one less than the number of vertical rules."""
    vertical_rule_count = len(self._v_rules)
    return vertical_rule_count - 1
prop rows : int
Expand source code
@property
def rows(self) -> int:
    """Number of rows: one less than the number of horizontal rules."""
    horizontal_rule_count = len(self._h_rules)
    return horizontal_rule_count - 1

Methods

def cell(self, point: tuple[float, float]) ‑> tuple[int, int]
Expand source code
def cell(self, point: tuple[float, float]) -> tuple[int, int]:
    """
    Look up which cell of the template contains a point.

    Args:
        point (tuple[float, float]): (x, y) coordinates in the template image

    Returns:
        tuple[int, int]: (row, col) of the first cell whose bounding rules
        enclose the point (bounds included), or (-1, -1) when no row or no
        column matches
    """

    px, py = point

    def _encloses(bound_a, bound_b, value) -> bool:
        return min(bound_a, bound_b) <= value <= max(bound_a, bound_b)

    # first pair of consecutive horizontal rules that encloses py at px
    row = next(
        (
            r
            for r in range(self.rows)
            if _encloses(
                self._h_rules[r]._y_at_x(px), self._h_rules[r + 1]._y_at_x(px), py
            )
        ),
        -1,
    )

    # first pair of consecutive vertical rules that encloses px at py
    col = next(
        (
            c
            for c in range(self.cols)
            if _encloses(
                self._v_rules[c]._x_at_y(py), self._v_rules[c + 1]._x_at_y(py), px
            )
        ),
        -1,
    )

    if row < 0 or col < 0:
        return (-1, -1)

    return (row, col)

Get the cell index (row, col) that corresponds with the point (x, y) in the template image

Args

point : tuple[float, float]
the coordinates in the template image

Returns

tuple[int, int]
(row, col)
def cell_height(self, header_factor: float = 0.8) ‑> int
Expand source code
def cell_height(self, header_factor: float = 0.8) -> int:
    """
    Distance between the first two horizontal rules, multiplied by
    `header_factor` and truncated to an int.
    """
    lower_rule = self._h_rules[1]
    upper_rule = self._h_rules[0]
    scaled = (lower_rule._y - upper_rule._y) * header_factor
    return int(scaled)
def cell_heights(self, header_factors: list[float] | float) ‑> list[int]
Expand source code
def cell_heights(self, header_factors: list[float] | float) -> list[int]:
    if isinstance(header_factors, float):
        header_factors = [header_factors]
    header_factors = cast(list, header_factors)
    return [
        int((self._h_rules[1]._y - self._h_rules[0]._y) * f) for f in header_factors
    ]
def cell_polygon(self, cell: tuple[int, int]) ‑> tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]
Expand source code
def cell_polygon(
    self, cell: tuple[int, int]
) -> tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]:
    """
    Corner points (x, y) of the polygon around the requested (row, col) cell,
    ordered top left, top right, bottom right, bottom left.

    Raises TauluException when any pair of surrounding rules has no
    intersection.
    """

    row, col = cell

    self._check_col_idx(col)
    self._check_row_idx(row)

    upper, lower = self._h_rules[row], self._h_rules[row + 1]
    left, right = self._v_rules[col], self._v_rules[col + 1]

    # each corner is where a horizontal rule meets a vertical one
    corner_tl = upper.intersection(left)
    corner_tr = upper.intersection(right)
    corner_bl = lower.intersection(left)
    corner_br = lower.intersection(right)

    for corner in (corner_tl, corner_tr, corner_bl, corner_br):
        if corner is None:
            raise TauluException("the lines around this cell do not intersect")

    return corner_tl, corner_tr, corner_br, corner_bl  # type:ignore

Return points (x,y) that make up a polygon around the requested cell (top left, top right, bottom right, bottom left)

def cell_width(self, i: int) ‑> int
Expand source code
def cell_width(self, i: int) -> int:
    """
    Width of column `i`: the difference between the `_x` of the vertical rules
    on either side of it, truncated to an int.
    """
    self._check_col_idx(i)
    right_rule = self._v_rules[i + 1]
    left_rule = self._v_rules[i]
    span = right_rule._x - left_rule._x
    return int(span)
def cell_widths(self, start: int = 0) ‑> list[int]
Expand source code
def cell_widths(self, start: int = 0) -> list[int]:
    """Widths of all columns from column `start` up to the last one."""
    column_indices = range(start, self.cols)
    return list(map(self.cell_width, column_indices))
def intersection(self, index: tuple[int, int]) ‑> tuple[float, float]
Expand source code
def intersection(self, index: tuple[int, int]) -> tuple[float, float]:
    """
    Returns the intersection of the index[0]th horizontal rule and the
    index[1]th vertical rule
    """

    horizontal = self._h_rules[index[0]]
    vertical = self._v_rules[index[1]]
    crossing = horizontal.intersection(vertical)
    assert crossing is not None
    return crossing

Returns the intersection of the index[0]th horizontal rule and the index[1]th vertical rule

def save(self, path: os.PathLike[str])
Expand source code
@log_calls(level=logging.DEBUG)
def save(self, path: PathLike[str]):
    """
    Write this HeaderTemplate to `path` as json.

    The file holds a single object with a "rules" key, containing the
    `to_dict()` representation of every rule.
    """

    serialised_rules = [rule.to_dict() for rule in self._rules]

    with open(path, "w") as handle:
        json.dump({"rules": serialised_rules}, handle)

Save the HeaderTemplate to the given path, as a json

Inherited members