"""
Provide the Ancestry loading API.
"""
import logging
from xml.etree.ElementTree import Element
from html5lib import parse
from betty.asyncio import gather
from betty.fetch import Fetcher, FetchError
from betty.media_type import MediaType, InvalidMediaType
from betty.ancestry import Link, HasLinks
from betty.project import Project, ProjectEvent
[docs]
class LoadAncestryEvent(ProjectEvent):
"""
Dispatched to load ancestry data into a project.
"""
pass
[docs]
class PostLoadAncestryEvent(ProjectEvent):
"""
Dispatched to postprocess ancestry data that was loaded into a project.
This event is invoked immediately after :py:class:`betty.load.LoadAncestryEvent`.
"""
pass
async def _fetch_link_titles(project: Project) -> None:
await gather(
*(
_fetch_link_title(project.app.fetcher, link)
for entity in project.ancestry
if isinstance(entity, HasLinks)
for link in entity.links
)
)
async def _fetch_link_title(fetcher: Fetcher, link: Link) -> None:
if link.label is not None:
return
try:
response = await fetcher.fetch(link.url)
except FetchError as error:
logging.getLogger(__name__).warning(str(error))
return
try:
content_type = MediaType(response.headers["Content-Type"])
except InvalidMediaType:
return
if (content_type.type, content_type.subtype, content_type.suffix) not in (
("text", "html", None),
("application", "xhtml", "+xml"),
):
return
document = parse(response.text)
title = _extract_html_title(document)
if title is not None:
link.label = title
if not link.description:
description = _extract_html_meta_description(document)
if description is not None:
link.description = description
def _extract_html_title(document: Element) -> str | None:
head = document.find(
"ns:head",
namespaces={
"ns": "http://www.w3.org/1999/xhtml",
},
)
if head is None:
return None
title = head.find(
"ns:title",
namespaces={
"ns": "http://www.w3.org/1999/xhtml",
},
)
if title is None:
return None
return title.text
def _extract_html_meta_description(document: Element) -> str | None:
head = document.find(
"ns:head",
namespaces={
"ns": "http://www.w3.org/1999/xhtml",
},
)
if head is None:
return None
metas = head.findall(
"ns:meta",
namespaces={
"ns": "http://www.w3.org/1999/xhtml",
},
)
for attr_name, attr_value in (
("name", "description"),
("property", "og:description"),
):
for meta in metas:
if meta.get(attr_name, None) == attr_value:
return meta.get("content", None)
return None