Source code for ltp.mods_postprocessor

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
# Imports =====================================================================
import dhtmlparser


# Functions & objects =========================================================
[docs]def _remove_hairs(inp, hairs="/:;,- []<>()"): """ Remove "special" characters from beginning and the end of the `inp`. For example ``,a-sd,-/`` -> ``a-sd``. Args: inp (str): Input string. Returns: str: Cleaned string. """ while inp and inp[-1] in hairs: inp = inp[:-1] while inp and inp[0] in hairs: inp = inp[1:] return inp
def insert_tag(tag, before, root):
    """
    Insert `tag` before `before` tag if present. If not, insert it into `root`.

    Args:
        tag (obj): HTMLElement instance which will be inserted. Its
            ``parent`` attribute is set to the element it ends up in.
        before (obj): HTMLElement instance, or list/tuple of them (only the
            first item is used). The element must have its ``parent``
            attribute set, because `tag` is inserted into
            ``before.parent.childs``. Falsy value (``None``, empty list)
            means "append to `root`".
        root (obj): HTMLElement instance used when `before` is falsy. A
            non-empty list/tuple is also accepted, in which case its first
            item is used.
    """
    if not before:
        # search results are lists, so accept them for `root` the same way
        # they are accepted for `before`
        if isinstance(root, (tuple, list)) and root:
            root = root[0]

        root.childs.append(tag)
        tag.parent = root
        return

    if isinstance(before, (tuple, list)):
        before = before[0]

    # put the tag right in front of the first matching element
    parent = before.parent
    parent.childs.insert(parent.childs.index(before), tag)
    tag.parent = parent
def transform_content(tags, content_transformer):
    """
    Transform content in all `tags` using result of `content_transformer(tag)`
    call.

    Children of each tag are replaced by one new element created from the
    value returned by the transformer. The transformer is called with the
    tag while it still holds its original children.

    Args:
        tags (obj/list): HTMLElement instance, or list/tuple of HTMLElement
            instances. Empty list/tuple means there is nothing to do.
        content_transformer (function): Function which is called as
            ``content_transformer(tag)``.
    """
    # isinstance() also recognizes list/tuple subclasses, which would
    # otherwise be wrapped and processed as if they were a single tag
    if not isinstance(tags, (tuple, list)):
        tags = [tags]

    for tag in tags:
        tag.childs = [
            dhtmlparser.HTMLElement(content_transformer(tag))
        ]
def postprocess_mods(mods, package_id=None):
    """
    Fix bugs in `mods` produced by XSLT template.

    Following corrections are applied, in this order:

    - ``ID="MODS_TITLE_0001"`` is set on the first ``<mods:mods>`` tag.
    - ``type="code"`` is set on the first marccountry ``<mods:placeTerm>``.
    - ``<mods:identifier type="uuid">`` with `package_id` is added (only
      when `package_id` is given).
    - ``<mods:place>`` with marccountry ``xr`` is added when the document
      contains no marccountry place term. This step is skipped when there is
      no ``<mods:originInfo>`` to put it into.
    - ``<mods:genre>`` ``electronic title`` is added when missing.
    - Leading/trailing special characters are removed from the title and
      from the text place terms.

    Args:
        mods (str): XML string generated by XSLT template.
        package_id (str, default None): UUID of the package.

    Returns:
        str: Updated XML.
    """
    def content_without_hairs(tag):
        return _remove_hairs(tag.getContent())

    dom = dhtmlparser.parseString(mods)
    dhtmlparser.makeDoubleLinked(dom)

    # add missing parameter
    mods_tag = dom.find("mods:mods")
    if mods_tag:
        mods_tag[0].params["ID"] = "MODS_TITLE_0001"

    # insert_tag() appends into `root.childs`, so it has to get one element,
    # not the list returned by find(). When there is no <mods:mods> at all,
    # the parsed document container is used as the root instead.
    mods_root = mods_tag[0] if mods_tag else dom

    # fix invalid type= parameter
    placeterm_tag = dom.match(
        "mods:originInfo",
        "mods:place",
        ["mods:placeTerm", {"authority": "marccountry"}]
    )
    if placeterm_tag:
        placeterm_tag[0].params["type"] = "code"

    # add identifier to the section with identifiers
    if package_id:
        uuid_tag = dhtmlparser.HTMLElement(
            "mods:identifier",
            {"type": "uuid"},
            [dhtmlparser.HTMLElement(package_id)]
        )
        insert_tag(uuid_tag, dom.find("mods:identifier"), mods_root)

    # none of the following steps adds or removes <mods:originInfo>, so one
    # lookup serves both of them
    origin_info = dom.find("mods:originInfo")

    # add marccountry if not found; it can only be placed into an existing
    # <mods:originInfo>
    marccountry = dom.find("mods:placeTerm", {"authority": "marccountry"})
    if not marccountry and origin_info:
        marccountry_tag = dhtmlparser.HTMLElement(
            "mods:place",
            [
                dhtmlparser.HTMLElement(
                    # XML names are case sensitive - use the same spelling
                    # as the lookups above
                    "mods:placeTerm",
                    {"type": "code", "authority": "marccountry"},
                    [dhtmlparser.HTMLElement("xr")]
                )
            ]
        )
        insert_tag(
            marccountry_tag,
            dom.match("mods:mods", "mods:originInfo", "mods:place"),
            origin_info[0]
        )

    # add <genre> tag if not found
    genre = dom.find(
        "mods:genre",
        fn=lambda x: x.getContent().lower().strip() == "electronic title"
    )
    if not genre:
        genre_tag = dhtmlparser.HTMLElement(
            "mods:genre",
            [dhtmlparser.HTMLElement("electronic title")]
        )
        insert_tag(genre_tag, origin_info, mods_root)

    # remove hairs from some tags
    transform_content(
        dom.match("mods:mods", "mods:titleInfo", "mods:title"),
        content_without_hairs
    )
    transform_content(
        dom.match(
            "mods:originInfo",
            "mods:place",
            ["mods:placeTerm", {"type": "text"}]
        ),
        content_without_hairs
    )

    return dom.prettify()