Module taulu.header_template
A HeaderTemplate defines the structure of a table header.
Classes
class HeaderTemplate (rules: Iterable[Iterable[int]])-
Expand source code
class HeaderTemplate(TableIndexer):
    def __init__(self, rules: Iterable[Iterable[int]]):
        """
        A HeaderTemplate is a collection of rules of a table. This class implements methods
        for finding cell positions in a table image, given the template the image adheres to.

        Args:
            rules: 2D array of lines, where each line is represented as [x0, y0, x1, y1]
        """
        super().__init__()
        self._rules = [_Rule(*rule) for rule in rules]
        # horizontal rules are kept sorted by `_y` and vertical rules by `_x`, so
        # that two consecutive rules delimit one row / one column of cells
        self._h_rules = sorted(
            [rule for rule in self._rules if rule._is_horizontal()], key=lambda r: r._y
        )
        self._v_rules = sorted(
            [rule for rule in self._rules if rule._is_vertical()], key=lambda r: r._x
        )

    @log_calls(level=logging.DEBUG)
    def save(self, path: PathLike[str]):
        """
        Save the HeaderTemplate to the given path, as a json

        Args:
            path: file to write; it receives one object with a "rules" list that
                holds the `to_dict()` form of every rule
        """
        data = {"rules": [r.to_dict() for r in self._rules]}
        with open(path, "w") as f:
            json.dump(data, f)

    @staticmethod
    @log_calls(level=logging.DEBUG)
    def from_saved(path: PathLike[str]) -> "HeaderTemplate":
        """
        Load a HeaderTemplate from a json file.

        Args:
            path: json file holding a "rules" list whose entries have the keys
                "x0", "y0", "x1" and "y1" (presumably written by `save`)
        """
        with open(path, "r") as f:
            data = json.load(f)
        rules = [[r["x0"], r["y0"], r["x1"], r["y1"]] for r in data["rules"]]
        return HeaderTemplate(rules)

    @property
    def cols(self) -> int:
        """Number of columns: one less than the number of vertical rules."""
        return len(self._v_rules) - 1

    @property
    def rows(self) -> int:
        """Number of rows: one less than the number of horizontal rules."""
        return len(self._h_rules) - 1

    @staticmethod
    @log_calls(level=logging.DEBUG)
    def annotate_image(
        template: MatLike | str, crop: Optional[PathLike[str]] = None, margin: int = 10
    ) -> "HeaderTemplate":
        """
        Utility method that allows users to create a template from a template image.

        The user is asked to click to annotate lines (two clicks per line).
        A right click removes the last annotated line.

        Args:
            template: the image on which to annotate the header lines
                (or the path of that image)
            crop (PathLike | None): if given, crop the template image first, then do
                the annotation. The cropped image will be stored at the supplied path
            margin (int): margin to add around the cropping of the header

        Raises:
            TauluException: if `template` is a path that cannot be read as an image
        """
        if isinstance(template, str):
            loaded = cv.imread(template)
            # cv.imread signals failure by returning None instead of raising
            if loaded is None:
                raise TauluException(
                    f"could not read the template image at '{template}'"
                )
            template = loaded
        template = cast(MatLike, template)

        if crop is not None:
            cropped = HeaderTemplate._crop(template, margin)
            cv.imwrite(os.fspath(crop), cropped)
            template = cropped

        start_point = None
        lines: list[list[int]] = []
        # the annotations are drawn on a copy, `template` itself stays untouched
        anno_template = np.copy(template)

        def get_point(event, x, y, flags, params):
            nonlocal start_point, anno_template
            _ = flags
            _ = params
            if event == cv.EVENT_LBUTTONDOWN:
                if start_point is None:
                    # first click of a line: remember it and wait for the second
                    start_point = (x, y)
                    return
                line: list[int] = [start_point[0], start_point[1], x, y]
                cv.line(  # type:ignore
                    anno_template,  # type:ignore
                    start_point,
                    (x, y),
                    (0, 255, 0),
                    2,
                    cv.LINE_AA,
                )
                cv.imshow(constants.WINDOW, anno_template)  # type:ignore
                lines.append(line)
                start_point = None
            elif event == cv.EVENT_RBUTTONDOWN:
                start_point = None
                # remove the last annotation (no-op when there is none)
                del lines[-1:]
                # restart from the clean image, otherwise the removed line would
                # remain visible; redraw the lines that are left
                anno_template = np.copy(template)
                for line in lines:
                    cv.line(
                        anno_template,
                        (line[0], line[1]),
                        (line[2], line[3]),
                        (0, 255, 0),
                        2,
                        cv.LINE_AA,
                    )
                cv.imshow(constants.WINDOW, anno_template)

        print(ANNO_HELP)
        imu.show(anno_template, get_point, title="annotate the header")
        return HeaderTemplate(lines)

    @staticmethod
    @log_calls(level=logging.DEBUG, include_return=True)
    def _crop(template: MatLike, margin: int = 10) -> MatLike:
        """
        Crop the image to contain only the annotations, such that it can be used
        as the header image in the taulu workflow.

        The user clicks the four corners of the table (a right click removes the
        last point); the result is the axis-aligned bounding box of those points,
        enlarged by `margin` pixels on every side and clamped to the image origin.

        Args:
            template: the image to crop
            margin (int): number of pixels to keep around the bounding box

        Raises:
            TauluException: if not exactly four points were annotated
        """
        points = []
        anno_template = np.copy(template)

        def get_point(event, x, y, flags, params):
            nonlocal anno_template
            _ = flags
            _ = params
            if event == cv.EVENT_LBUTTONDOWN:
                cv.circle(  # type:ignore
                    anno_template,  # type:ignore
                    (x, y),
                    4,
                    (0, 255, 0),
                    2,
                )
                cv.imshow(constants.WINDOW, anno_template)  # type:ignore
                points.append((x, y))
            elif event == cv.EVENT_RBUTTONDOWN:
                # remove the last annotation (no-op when there is none)
                del points[-1:]
                # restart from the clean image so the removed circle disappears
                anno_template = np.copy(template)
                for p in points:
                    cv.circle(
                        anno_template,
                        p,
                        4,
                        (0, 255, 0),
                        2,
                    )
                cv.imshow(constants.WINDOW, anno_template)

        print(CROP_HELP)
        imu.show(anno_template, get_point, title="crop the header")

        # explicit check instead of `assert`, which is stripped under `python -O`
        if len(points) != 4:
            raise TauluException(
                "you need to annotate the four corners of the table in order to crop it"
            )

        # rectangular bounding box of the clicked points; columns are (x, y)
        points_np = np.array(points)
        x_min, y_min = (int(v) for v in points_np.min(axis=0))
        x_max, y_max = (int(v) for v in points_np.max(axis=0))

        # clamp the start indices at 0: a negative slice start would wrap around
        # to the other side of the image instead of stopping at the border
        top = max(y_min - margin, 0)
        left = max(x_min - margin, 0)
        # slice ends beyond the image are truncated by numpy; only guard against
        # an end that lies before the start (very negative margins)
        bottom = max(y_max + margin, top)
        right = max(x_max + margin, left)

        return template[top:bottom, left:right]

    @staticmethod
    def from_vgg_annotation(annotation: str) -> "HeaderTemplate":
        """
        Create a HeaderTemplate from annotations made in
        [vgg](https://annotate.officialstatistics.org/), using the polylines tool.

        Only polylines that consist of exactly two points are used as rules;
        every other region is skipped.

        Args:
            annotation (str): the path of the annotation csv file
        """
        rules = []
        # newline="" is what the csv module asks for when opening csv files
        with open(annotation, "r", newline="") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                shape_attributes = json.loads(row["region_shape_attributes"])
                if shape_attributes["name"] != "polyline":
                    continue
                x_points = shape_attributes["all_points_x"]
                y_points = shape_attributes["all_points_y"]
                if len(x_points) == 2 and len(y_points) == 2:
                    rules.append([x_points[0], y_points[0], x_points[1], y_points[1]])

        return HeaderTemplate(rules)

    def cell_width(self, i: int) -> int:
        """
        Width of column `i`: the difference between the `_x` of the vertical
        rules on either side of it, truncated to an int.
        """
        self._check_col_idx(i)
        return int(self._v_rules[i + 1]._x - self._v_rules[i]._x)

    def cell_widths(self, start: int = 0) -> list[int]:
        """Widths of all columns from column `start` up to the last one."""
        return [self.cell_width(i) for i in range(start, self.cols)]

    def cell_height(self, header_factor: float = 0.8) -> int:
        """
        Distance between the first two horizontal rules, scaled by
        `header_factor` and truncated to an int.
        """
        return int((self._h_rules[1]._y - self._h_rules[0]._y) * header_factor)

    def cell_heights(self, header_factors: list[float] | float) -> list[int]:
        """
        Like `cell_height`, for several factors at once.

        Args:
            header_factors: a single factor (int or float) or a list of factors

        Returns:
            one height per factor, in the same order
        """
        # a bare number (int as well as float) is treated as a one-element list
        if isinstance(header_factors, (int, float)):
            header_factors = [header_factors]
        header_factors = cast(list, header_factors)
        header_height = self._h_rules[1]._y - self._h_rules[0]._y
        return [int(header_height * f) for f in header_factors]

    def intersection(self, index: tuple[int, int]) -> tuple[float, float]:
        """
        Returns the intersection of the index[0]th horizontal rule and the
        index[1]th vertical rule

        Raises:
            TauluException: if the two rules do not intersect
        """
        ints = self._h_rules[index[0]].intersection(self._v_rules[index[1]])
        # explicit check instead of `assert`, which is stripped under `python -O`
        if ints is None:
            raise TauluException("the requested rules do not intersect")
        return ints

    def cell(self, point: tuple[float, float]) -> tuple[int, int]:
        """
        Get the cell index (row, col) that corresponds with the point (x, y) in the template image

        Args:
            point (tuple[float, float]): the coordinates in the template image

        Returns:
            tuple[int, int]: (row, col), or (-1, -1) when the point lies in no cell
        """
        x, y = point

        row = -1
        col = -1

        # the rules are evaluated at the point's own x / y; the first row and
        # column whose bounding rules enclose the point win
        for i in range(self.rows):
            y0 = self._h_rules[i]._y_at_x(x)
            y1 = self._h_rules[i + 1]._y_at_x(x)
            if min(y0, y1) <= y <= max(y0, y1):
                row = i
                break

        for i in range(self.cols):
            x0 = self._v_rules[i]._x_at_y(y)
            x1 = self._v_rules[i + 1]._x_at_y(y)
            if min(x0, x1) <= x <= max(x0, x1):
                col = i
                break

        if row == -1 or col == -1:
            return (-1, -1)

        return (row, col)

    def cell_polygon(
        self, cell: tuple[int, int]
    ) -> tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]:
        """
        Return points (x,y) that make up a polygon around the requested cell
        (top left, top right, bottom right, bottom left)

        Raises:
            TauluException: if the rules around the cell do not all intersect
        """
        row, col = cell

        self._check_col_idx(col)
        self._check_row_idx(row)

        top_rule = self._h_rules[row]
        bottom_rule = self._h_rules[row + 1]
        left_rule = self._v_rules[col]
        right_rule = self._v_rules[col + 1]

        # Calculate corner points using intersections
        top_left = top_rule.intersection(left_rule)
        top_right = top_rule.intersection(right_rule)
        bottom_left = bottom_rule.intersection(left_rule)
        bottom_right = bottom_rule.intersection(right_rule)

        if any(
            point is None for point in (top_left, top_right, bottom_left, bottom_right)
        ):
            raise TauluException("the lines around this cell do not intersect")

        return top_left, top_right, bottom_right, bottom_left  # type:ignore

    def region(
        self, start: tuple[int, int], end: tuple[int, int]
    ) -> tuple[Point, Point, Point, Point]:
        """
        Return the corner points (top left, top right, bottom right, bottom left)
        of the region that runs from cell `start` to cell `end`, both given as
        (row, col) and both included. The coordinates are truncated to ints.

        Raises:
            TauluException: if the rules around the region do not all intersect
        """
        self._check_row_idx(start[0])
        self._check_row_idx(end[0])
        self._check_col_idx(start[1])
        self._check_col_idx(end[1])

        # the rules that surround this region
        top_rule = self._h_rules[start[0]]
        bottom_rule = self._h_rules[end[0] + 1]
        left_rule = self._v_rules[start[1]]
        right_rule = self._v_rules[end[1] + 1]

        # four points that will be the bounding polygon of the result,
        # which needs to be rectified
        top_left = top_rule.intersection(left_rule)
        top_right = top_rule.intersection(right_rule)
        bottom_left = bottom_rule.intersection(left_rule)
        bottom_right = bottom_rule.intersection(right_rule)

        if (
            top_left is None
            or top_right is None
            or bottom_left is None
            or bottom_right is None
        ):
            raise TauluException("the lines around this row do not intersect properly")

        def to_point(pnt) -> Point:
            return (int(pnt[0]), int(pnt[1]))

        return (
            to_point(top_left),
            to_point(top_right),
            to_point(bottom_right),
            to_point(bottom_left),
        )

    def text_regions(
        self, img: MatLike, row: int, margin_x: int = 10, margin_y: int = -20
    ) -> list[tuple[tuple[int, int], tuple[int, int]]]:
        """Not supported on a HeaderTemplate: always raises TauluException."""
        raise TauluException("text_regions should not be called on a HeaderTemplate")


# Subclasses implement methods for going from a pixel in the input image to a table cell index, and cropping an image to the given table cell index.
A HeaderTemplate is a collection of rules of a table. This class implements methods for finding cell positions in a table image, given the template the image adheres to.
Args
rules- 2D array of lines, where each line is represented as [x0, y0, x1, y1]
Ancestors
- TableIndexer
- abc.ABC
Static methods
def annotate_image(template: cv2.Mat | numpy.ndarray | str,
crop: os.PathLike[str] | None = None,
margin: int = 10) ‑> HeaderTemplate-
Expand source code
@staticmethod
@log_calls(level=logging.DEBUG)
def annotate_image(
    template: MatLike | str, crop: Optional[PathLike[str]] = None, margin: int = 10
) -> "HeaderTemplate":
    """
    Utility method that allows users to create a template from a template image.

    The user is asked to click to annotate lines (two clicks per line).
    A right click removes the last annotated line.

    Args:
        template: the image on which to annotate the header lines
            (or the path of that image)
        crop (PathLike | None): if given, crop the template image first, then do
            the annotation. The cropped image will be stored at the supplied path
        margin (int): margin to add around the cropping of the header

    Raises:
        TauluException: if `template` is a path that cannot be read as an image
    """
    if isinstance(template, str):
        loaded = cv.imread(template)
        # cv.imread signals failure by returning None instead of raising
        if loaded is None:
            raise TauluException(f"could not read the template image at '{template}'")
        template = loaded
    template = cast(MatLike, template)

    if crop is not None:
        cropped = HeaderTemplate._crop(template, margin)
        cv.imwrite(os.fspath(crop), cropped)
        template = cropped

    start_point = None
    lines: list[list[int]] = []
    # the annotations are drawn on a copy, `template` itself stays untouched
    anno_template = np.copy(template)

    def get_point(event, x, y, flags, params):
        nonlocal start_point, anno_template
        _ = flags
        _ = params
        if event == cv.EVENT_LBUTTONDOWN:
            if start_point is None:
                # first click of a line: remember it and wait for the second
                start_point = (x, y)
                return
            line: list[int] = [start_point[0], start_point[1], x, y]
            cv.line(  # type:ignore
                anno_template,  # type:ignore
                start_point,
                (x, y),
                (0, 255, 0),
                2,
                cv.LINE_AA,
            )
            cv.imshow(constants.WINDOW, anno_template)  # type:ignore
            lines.append(line)
            start_point = None
        elif event == cv.EVENT_RBUTTONDOWN:
            start_point = None
            # remove the last annotation (no-op when there is none)
            del lines[-1:]
            # restart from the clean image, otherwise the removed line would
            # remain visible; redraw the lines that are left
            anno_template = np.copy(template)
            for line in lines:
                cv.line(
                    anno_template,
                    (line[0], line[1]),
                    (line[2], line[3]),
                    (0, 255, 0),
                    2,
                    cv.LINE_AA,
                )
            cv.imshow(constants.WINDOW, anno_template)

    print(ANNO_HELP)
    imu.show(anno_template, get_point, title="annotate the header")
    return HeaderTemplate(lines)


# Utility method that allows users to create a template from a template image.
The user is asked to click to annotate lines (two clicks per line).
Args
template- the image on which to annotate the header lines
crop:str | None- if str, crop the template image first, then do the annotation. The cropped image will be stored at the supplied path
margin:int- margin to add around the cropping of the header
def from_saved(path: os.PathLike[str]) ‑> HeaderTemplate-
Expand source code
@staticmethod
@log_calls(level=logging.DEBUG)
def from_saved(path: PathLike[str]) -> "HeaderTemplate":
    """
    Build a HeaderTemplate from a json file.

    The file must hold an object with a "rules" list; each entry provides the
    keys "x0", "y0", "x1" and "y1" of one rule.
    """
    with open(path, "r") as handle:
        stored = json.load(handle)

    coordinates = []
    for entry in stored["rules"]:
        coordinates.append([entry["x0"], entry["y0"], entry["x1"], entry["y1"]])

    return HeaderTemplate(coordinates)


# def from_vgg_annotation(annotation: str) ‑> HeaderTemplate-
Expand source code
@staticmethod
def from_vgg_annotation(annotation: str) -> "HeaderTemplate":
    """
    Create a HeaderTemplate from annotations made in
    [vgg](https://annotate.officialstatistics.org/), using the polylines tool.

    Only polylines that consist of exactly two points are used as rules;
    every other region is skipped.

    Args:
        annotation (str): the path of the annotation csv file
    """
    rules = []
    # newline="" is what the csv module asks for when opening csv files
    with open(annotation, "r", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            shape_attributes = json.loads(row["region_shape_attributes"])
            if shape_attributes["name"] != "polyline":
                continue
            x_points = shape_attributes["all_points_x"]
            y_points = shape_attributes["all_points_y"]
            if len(x_points) == 2 and len(y_points) == 2:
                rules.append([x_points[0], y_points[0], x_points[1], y_points[1]])

    return HeaderTemplate(rules)


# Create a HeaderTemplate from annotations made in vgg, using the polylines tool.
Args
annotation:str- the path of the annotation csv file
Instance variables
prop cols : int-
Expand source code
@property
def cols(self) -> int:
    """Number of columns: one less than the number of vertical rules."""
    vertical_rule_count = len(self._v_rules)
    return vertical_rule_count - 1


# prop rows : int-
Expand source code
@property
def rows(self) -> int:
    """Number of rows: one less than the number of horizontal rules."""
    horizontal_rule_count = len(self._h_rules)
    return horizontal_rule_count - 1
Methods
def cell(self, point: tuple[float, float]) ‑> tuple[int, int]-
Expand source code
def cell(self, point: tuple[float, float]) -> tuple[int, int]:
    """
    Find the (row, col) index of the cell that contains a point of the template image.

    The bounding rules are evaluated at the point's own x / y; the first row and
    the first column whose two rules enclose the point (borders included) win.

    Args:
        point (tuple[float, float]): the (x, y) coordinates in the template image

    Returns:
        tuple[int, int]: (row, col), or (-1, -1) when no row or no column matches
    """
    px, py = point

    def encloses(bound_a, bound_b, value) -> bool:
        # the two bounds may come in either order
        return min(bound_a, bound_b) <= value <= max(bound_a, bound_b)

    found_row = next(
        (
            idx
            for idx in range(self.rows)
            if encloses(
                self._h_rules[idx]._y_at_x(px),
                self._h_rules[idx + 1]._y_at_x(px),
                py,
            )
        ),
        -1,
    )
    found_col = next(
        (
            idx
            for idx in range(self.cols)
            if encloses(
                self._v_rules[idx]._x_at_y(py),
                self._v_rules[idx + 1]._x_at_y(py),
                px,
            )
        ),
        -1,
    )

    if found_row != -1 and found_col != -1:
        return (found_row, found_col)
    return (-1, -1)


# Get the cell index (row, col) that corresponds with the point (x, y) in the template image
Args
point:tuple[float, float]- the coordinates in the template image
Returns
tuple[int, int]- (row, col)
def cell_height(self, header_factor: float = 0.8) ‑> int-
Expand source code
def cell_height(self, header_factor: float = 0.8) -> int:
    """
    Distance between the first two horizontal rules, scaled by `header_factor`
    and truncated to an int.
    """
    first_rule, second_rule = self._h_rules[0], self._h_rules[1]
    header_span = second_rule._y - first_rule._y
    return int(header_span * header_factor)


# def cell_heights(self, header_factors: list[float] | float) ‑> list[int]-
Expand source code
def cell_heights(self, header_factors: list[float] | float) -> list[int]: if isinstance(header_factors, float): header_factors = [header_factors] header_factors = cast(list, header_factors) return [ int((self._h_rules[1]._y - self._h_rules[0]._y) * f) for f in header_factors ] def cell_polygon(self, cell: tuple[int, int]) ‑> tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]-
Expand source code
def cell_polygon(
    self, cell: tuple[int, int]
) -> tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int]]:
    """
    Corner points (x, y) of the polygon around a cell, ordered
    top left, top right, bottom right, bottom left.

    Args:
        cell (tuple[int, int]): the (row, col) index of the cell

    Raises:
        TauluException: if one of the rule pairs around the cell has no intersection
    """
    row, col = cell

    self._check_col_idx(col)
    self._check_row_idx(row)

    # the four rules that enclose the cell
    upper, lower = self._h_rules[row], self._h_rules[row + 1]
    left, right = self._v_rules[col], self._v_rules[col + 1]

    # each corner is where a horizontal and a vertical rule cross
    top_left = upper.intersection(left)
    top_right = upper.intersection(right)
    bottom_left = lower.intersection(left)
    bottom_right = lower.intersection(right)

    for corner in (top_left, top_right, bottom_left, bottom_right):
        if corner is None:
            raise TauluException("the lines around this cell do not intersect")

    return top_left, top_right, bottom_right, bottom_left  # type:ignore


# Return points (x,y) that make up a polygon around the requested cell (top left, top right, bottom right, bottom left)
def cell_width(self, i: int) ‑> int-
Expand source code
def cell_width(self, i: int) -> int:
    """
    Width of column `i`: the difference between the `_x` of the vertical rules
    on either side of it, truncated to an int. The index is validated with
    `_check_col_idx` first.
    """
    self._check_col_idx(i)
    right_edge = self._v_rules[i + 1]
    left_edge = self._v_rules[i]
    return int(right_edge._x - left_edge._x)


# def cell_widths(self, start: int = 0) ‑> list[int]-
Expand source code
def cell_widths(self, start: int = 0) -> list[int]:
    """Widths of all columns from column `start` up to the last one, in order."""
    widths: list[int] = []
    for column in range(start, self.cols):
        widths.append(self.cell_width(column))
    return widths


# def intersection(self, index: tuple[int, int]) ‑> tuple[float, float]-
Expand source code
def intersection(self, index: tuple[int, int]) -> tuple[float, float]:
    """
    Returns the intersection of the index[0]th horizontal rule and the
    index[1]th vertical rule

    Args:
        index (tuple[int, int]): (horizontal rule index, vertical rule index)

    Raises:
        TauluException: if the two rules do not intersect
    """
    ints = self._h_rules[index[0]].intersection(self._v_rules[index[1]])
    # explicit check instead of `assert`, which is stripped under `python -O`
    # and would then let None escape as if it were a point
    if ints is None:
        raise TauluException("the requested rules do not intersect")
    return ints


# Returns the intersection of the index[0]th horizontal rule and the index[1]th vertical rule
def save(self, path: os.PathLike[str])-
Expand source code
@log_calls(level=logging.DEBUG)
def save(self, path: PathLike[str]):
    """
    Write the HeaderTemplate to `path` as json.

    The file receives one object with a "rules" list that holds the
    `to_dict()` form of every rule.
    """
    serialised_rules = []
    for rule in self._rules:
        serialised_rules.append(rule.to_dict())

    with open(path, "w") as out:
        json.dump({"rules": serialised_rules}, out)


# Save the HeaderTemplate to the given path, as a json
Inherited members