Module uim.model.helpers.text_extractor
Expand source code
# -*- coding: utf-8 -*-
# Copyright © 2021 Wacom Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import string
import uuid
from typing import List, Tuple, Optional, Dict, Any
from uim.codec.parser.uim import UIMParser
from uim.model.helpers.treeiterator import PreOrderEnumerator
from uim.model.ink import InkModel
from uim.model.semantics.node import InkNode, StrokeGroupNode, StrokeNode
from uim.model.semantics.schema import WORD, TEXT_LINE, CommonViews, HAS_ALTERNATIVE, HAS_CONTENT, HAS_NAMED_ENTITY, \
HAS_URI, HAS_LABEL
def uim_extract_text_and_semantics(uim_bytes: bytes, hwr_view: str = CommonViews.HWR_VIEW.value,
                                   ner_view: Optional[str] = None) \
        -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], str]:
    """
    Extracting the text and the named entities from the bytes of a Universal Ink Model file.

    The bytes are parsed with `UIMParser` and the resulting model is handed over to
    `uim_extract_text_and_semantics_from`.

    Parameters
    ----------
    uim_bytes: `bytes`
        Byte array with RIFF file from Universal Ink Model
    hwr_view: `str`
        Name of the HWR view.
    ner_view: `Optional[str]`
        NER view name. Kept for backward compatibility only: the value is not used, as the extraction reads the
        named entities from the knowledge graph of the model.

    Returns
    -------
    words: `List[dict]`
        List of words. Each word has its own dict containing the text, bounding box, and all alternatives.
    entities: `Dict[str, List[dict]]`
        Dictionary of entities. Each entity has its own dict containing the label, instance, and path ids.
    text: `str`
        Text extracted from the Universal Ink Model.

    Raises
    ------
    `InkModelException`
        If the Universal Ink Model does not contain the view with the requested view name.
    """
    ink_object: InkModel = UIMParser().parse(uim_bytes)
    # The model-level extractor accepts only the model and the HWR view name. Passing `ner_view` as a third
    # positional argument made every call fail with a TypeError.
    return uim_extract_text_and_semantics_from(ink_object, hwr_view)
def __collected_stroke_ids__(node: StrokeGroupNode) -> List[uuid.UUID]:
    """
    Collect the ids of all strokes found below a stroke group.

    The sub-tree is walked depth-first in child order; nested stroke groups are descended into, stroke nodes
    contribute the id of their stroke, and any other node type is ignored.

    Parameters
    ----------
    node: `StrokeGroupNode`
        Group whose sub-tree is searched for strokes.

    Returns
    -------
    stroke_ids: `List[uuid.UUID]`
        Stroke ids in the order in which the strokes are encountered.
    """
    stroke_ids: List[uuid.UUID] = []
    # Explicit stack; children are pushed in reverse so that popping visits them in their original order.
    pending: list = list(reversed(list(node.children)))
    while pending:
        current = pending.pop()
        if isinstance(current, StrokeNode):
            stroke_ids.append(current.stroke.id)
        elif isinstance(current, StrokeGroupNode):
            pending.extend(reversed(list(current.children)))
    return stroke_ids
def uim_extract_text_and_semantics_from(ink_model: InkModel, hwr_view: str = CommonViews.HWR_VIEW.value)\
        -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], str]:
    """
    Extracting the text and the named entities from a Universal Ink Model.

    The knowledge graph is scanned once to collect word contents, word alternatives, named entities, and the
    URIs of the text lines. Afterwards the tree of the requested view is traversed in pre-order and the words
    of every text line are assembled.

    Parameters
    ----------
    ink_model: `InkModel`
        Universal Ink Model
    hwr_view: `str`
        Name of the HWR view.

    Returns
    -------
    words: `List[dict]`
        List of words. Each word has its own dict containing the text ('text'), the alternatives
        ('alternatives'), the stroke ids ('path_id'), the node URI ('word-uri'), and the bounding box
        ('bounding_box').
    entities: `Dict[str, List[dict]]`
        Dictionary of entities, keyed by the URI of the entity. Each entry has its own dict containing the
        label, instance, and path ids. Entities without a URI statement are skipped; a missing label is
        reported as `None`.
    text: `str`
        Text extracted from the Universal Ink Model. Text lines are separated by a newline; words within a line
        are separated by a single space, except in front of punctuation.

    Raises
    ------
    `InkModelException`
        If the Universal Ink Model does not contain the view with the requested view name.
    """
    words: List[Dict[str, Any]] = []
    entities: Dict[str, List[Dict[str, Any]]] = {}
    entity_map: Dict[str, Dict[str, Any]] = {}
    text_nodes: Dict[str, str] = {}
    text_alternatives: Dict[str, List[str]] = {}
    # A set, as membership is tested for every node of the view tree
    text_line_uris = set()
    root: InkNode = ink_model.view_root(str(hwr_view))
    knowledge_graph = ink_model.knowledge_graph
    # Iterate over the triples and collect words, alternatives, entities, and text lines
    for s in knowledge_graph.statements:
        if s.predicate.startswith(HAS_ALTERNATIVE):
            text_alternatives.setdefault(s.subject, []).append(s.object)
        # Collect all words; only words with exactly one content statement are considered
        if s.object == WORD:
            content_statements = knowledge_graph.all_statements_for(s.subject, predicate=HAS_CONTENT)
            if len(content_statements) == 1:
                text_nodes[s.subject] = content_statements[0].object
        # Collect all entities
        if s.predicate == HAS_NAMED_ENTITY:
            # One dict is shared by all parts of the entity, thus URI and label statements that are seen after
            # a part statement are still visible for that part.
            entity: Dict[str, Any] = {'instance': s.object}
            for st in knowledge_graph.all_statements_for(s.object):
                if st.predicate.startswith('hasPart'):
                    entity_map[st.object] = entity
                elif st.predicate == HAS_URI:
                    entity['uri'] = st.object
                elif st.predicate == HAS_LABEL:
                    entity['label'] = st.object
        # Check for text lines
        elif s.object == TEXT_LINE:
            text_line_uris.add(s.subject)
    lines: List[str] = []
    for node in PreOrderEnumerator(root):
        if node.uri not in text_line_uris:
            continue
        line_parts: List[str] = []
        # Position of the word within the current line. It restarts for every line; a counter shared by all
        # lines put a stray space in front of the first word of every line but the first.
        pos: int = 0
        for word_node in node.children:
            path_ids: List[str] = [str(p.stroke.id) for p in word_node.children if isinstance(p, StrokeNode)]
            if word_node.uri in text_nodes:
                t = text_nodes[word_node.uri]
                # No separator in front of the first word of a line and in front of punctuation
                if pos == 0 or t in string.punctuation:
                    line_parts.append(t)
                else:
                    line_parts.append(f' {t}')
                bounding_box = word_node.group_bounding_box
                words.append({
                    'alternatives': text_alternatives.get(word_node.uri, []), 'text': t, 'path_id': path_ids,
                    "word-uri": word_node.uri,
                    "bounding_box": {
                        'x': bounding_box.x,
                        'y': bounding_box.y,
                        'width': bounding_box.width,
                        'height': bounding_box.height
                    }
                })
                pos += 1
            word_entity: Optional[Dict[str, Any]] = entity_map.get(word_node.uri)
            # The result is keyed by the entity URI, so an entity without URI statement cannot be reported
            if word_entity is not None and 'uri' in word_entity:
                entities.setdefault(word_entity['uri'], []).append(
                    {
                        'path_id': path_ids,
                        'label': word_entity.get('label'),
                        'instance': word_entity['instance']
                    }
                )
        lines.append(''.join(line_parts))
    return words, entities, '\n'.join(lines)
Functions
def uim_extract_text_and_semantics(uim_bytes: bytes, hwr_view: str = 'hwr', ner_view: Optional[str] = None) ‑> Tuple[List[dict], List[dict]]
-
Extracting the text from Universal Ink Model.
Parameters
uim_bytes
:bytes
- Byte array with RIFF file from Universal Ink Model
hwr_view
:str
- HWR view.
ner_view
:str
- NER view if needed.
Returns
text
:List[dict]
- List of text lines. Each line has its own dict containing the bounding box, all words, and
all entities.
Raises
<code>InkModelException</code> If the Universal Ink Model does not contain the view with the requested view name.
Expand source code
def uim_extract_text_and_semantics(uim_bytes: bytes, hwr_view: str = CommonViews.HWR_VIEW.value,
                                   ner_view: Optional[str] = None) \
        -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], str]:
    """
    Extracting the text and the named entities from the bytes of a Universal Ink Model file.

    The bytes are parsed with `UIMParser` and the resulting model is handed over to
    `uim_extract_text_and_semantics_from`.

    Parameters
    ----------
    uim_bytes: `bytes`
        Byte array with RIFF file from Universal Ink Model
    hwr_view: `str`
        Name of the HWR view.
    ner_view: `Optional[str]`
        NER view name. Kept for backward compatibility only: the value is not used, as the extraction reads the
        named entities from the knowledge graph of the model.

    Returns
    -------
    words: `List[dict]`
        List of words. Each word has its own dict containing the text, bounding box, and all alternatives.
    entities: `Dict[str, List[dict]]`
        Dictionary of entities. Each entity has its own dict containing the label, instance, and path ids.
    text: `str`
        Text extracted from the Universal Ink Model.

    Raises
    ------
    `InkModelException`
        If the Universal Ink Model does not contain the view with the requested view name.
    """
    ink_object: InkModel = UIMParser().parse(uim_bytes)
    # The model-level extractor accepts only the model and the HWR view name. Passing `ner_view` as a third
    # positional argument made every call fail with a TypeError.
    return uim_extract_text_and_semantics_from(ink_object, hwr_view)
def uim_extract_text_and_semantics_from(ink_model: InkModel, hwr_view: str = 'hwr') ‑> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], str]
-
Extracting the text from Universal Ink Model.
Parameters
ink_model
:InkModel
- Universal Ink Model
hwr_view
:str
- Name of the HWR view.
Returns
words
:List[dict]
- List of words. Each word has its own dict containing the text, bounding box, and all alternatives.
entities
:Dict[str, List[dict]]
- Dictionary of entities. Each entity has its own dict containing the label, instance, and path ids.
text
:str
- Text extracted from the Universal Ink Model.
Raises
<code>InkModelException</code> If the Universal Ink Model does not contain the view with the requested view name.
Examples
Expand source code
def uim_extract_text_and_semantics_from(ink_model: InkModel, hwr_view: str = CommonViews.HWR_VIEW.value)\
        -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], str]:
    """
    Extracting the text and the named entities from a Universal Ink Model.

    The knowledge graph is scanned once to collect word contents, word alternatives, named entities, and the
    URIs of the text lines. Afterwards the tree of the requested view is traversed in pre-order and the words
    of every text line are assembled.

    Parameters
    ----------
    ink_model: `InkModel`
        Universal Ink Model
    hwr_view: `str`
        Name of the HWR view.

    Returns
    -------
    words: `List[dict]`
        List of words. Each word has its own dict containing the text ('text'), the alternatives
        ('alternatives'), the stroke ids ('path_id'), the node URI ('word-uri'), and the bounding box
        ('bounding_box').
    entities: `Dict[str, List[dict]]`
        Dictionary of entities, keyed by the URI of the entity. Each entry has its own dict containing the
        label, instance, and path ids. Entities without a URI statement are skipped; a missing label is
        reported as `None`.
    text: `str`
        Text extracted from the Universal Ink Model. Text lines are separated by a newline; words within a line
        are separated by a single space, except in front of punctuation.

    Raises
    ------
    `InkModelException`
        If the Universal Ink Model does not contain the view with the requested view name.
    """
    words: List[Dict[str, Any]] = []
    entities: Dict[str, List[Dict[str, Any]]] = {}
    entity_map: Dict[str, Dict[str, Any]] = {}
    text_nodes: Dict[str, str] = {}
    text_alternatives: Dict[str, List[str]] = {}
    # A set, as membership is tested for every node of the view tree
    text_line_uris = set()
    root: InkNode = ink_model.view_root(str(hwr_view))
    knowledge_graph = ink_model.knowledge_graph
    # Iterate over the triples and collect words, alternatives, entities, and text lines
    for s in knowledge_graph.statements:
        if s.predicate.startswith(HAS_ALTERNATIVE):
            text_alternatives.setdefault(s.subject, []).append(s.object)
        # Collect all words; only words with exactly one content statement are considered
        if s.object == WORD:
            content_statements = knowledge_graph.all_statements_for(s.subject, predicate=HAS_CONTENT)
            if len(content_statements) == 1:
                text_nodes[s.subject] = content_statements[0].object
        # Collect all entities
        if s.predicate == HAS_NAMED_ENTITY:
            # One dict is shared by all parts of the entity, thus URI and label statements that are seen after
            # a part statement are still visible for that part.
            entity: Dict[str, Any] = {'instance': s.object}
            for st in knowledge_graph.all_statements_for(s.object):
                if st.predicate.startswith('hasPart'):
                    entity_map[st.object] = entity
                elif st.predicate == HAS_URI:
                    entity['uri'] = st.object
                elif st.predicate == HAS_LABEL:
                    entity['label'] = st.object
        # Check for text lines
        elif s.object == TEXT_LINE:
            text_line_uris.add(s.subject)
    lines: List[str] = []
    for node in PreOrderEnumerator(root):
        if node.uri not in text_line_uris:
            continue
        line_parts: List[str] = []
        # Position of the word within the current line. It restarts for every line; a counter shared by all
        # lines put a stray space in front of the first word of every line but the first.
        pos: int = 0
        for word_node in node.children:
            path_ids: List[str] = [str(p.stroke.id) for p in word_node.children if isinstance(p, StrokeNode)]
            if word_node.uri in text_nodes:
                t = text_nodes[word_node.uri]
                # No separator in front of the first word of a line and in front of punctuation
                if pos == 0 or t in string.punctuation:
                    line_parts.append(t)
                else:
                    line_parts.append(f' {t}')
                bounding_box = word_node.group_bounding_box
                words.append({
                    'alternatives': text_alternatives.get(word_node.uri, []), 'text': t, 'path_id': path_ids,
                    "word-uri": word_node.uri,
                    "bounding_box": {
                        'x': bounding_box.x,
                        'y': bounding_box.y,
                        'width': bounding_box.width,
                        'height': bounding_box.height
                    }
                })
                pos += 1
            word_entity: Optional[Dict[str, Any]] = entity_map.get(word_node.uri)
            # The result is keyed by the entity URI, so an entity without URI statement cannot be reported
            if word_entity is not None and 'uri' in word_entity:
                entities.setdefault(word_entity['uri'], []).append(
                    {
                        'path_id': path_ids,
                        'label': word_entity.get('label'),
                        'instance': word_entity['instance']
                    }
                )
        lines.append(''.join(line_parts))
    return words, entities, '\n'.join(lines)