Source code for ltp.mods_postprocessor

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
# Imports =====================================================================
import dhtmlparser


# Functions & objects =========================================================
def _remove_hairs(inp, hairs="/:;,- []<>()"):
    """
    Remove "special" characters from the beginning and the end of the `inp`.

    For example ``,a-sd,-/`` -> ``a-sd``. Characters inside the string are
    left untouched, only the leading and trailing runs are removed.

    Args:
        inp (str): Input string.
        hairs (str, default "/:;,- []<>()"): Characters which are removed
              from both ends of `inp`.

    Returns:
        str: Cleaned string. Falsy `inp` (empty string, ``None``) is \
             returned unchanged.
    """
    # nothing to clean; this also keeps ``None`` away from the .strip() call
    if not inp:
        return inp

    # strip() treats `hairs` as a set of characters and trims both ends
    # without building a new string for every removed character
    return inp.strip(hairs)
def insert_tag(tag, before, root):
    """
    Insert `tag` before `before` tag if present. If not, insert it into `root`.

    Args:
        tag (obj): HTMLElement instance.
        before (obj): HTMLElement instance, or list / tuple of them. When
               a sequence is given, its first item is used as the anchor.
               Falsy value (``None``, empty list) means "no anchor".
        root (obj): HTMLElement instance, which is used as the parent of
             `tag` when there is no anchor.
    """
    # no anchor -> put the tag at the end of the `root`
    if not before:
        root.childs.append(tag)
        tag.parent = root
        return

    # isinstance() instead of exact type comparison, so subclasses of
    # list / tuple are unpacked too instead of failing on `.parent` lookup
    if isinstance(before, (tuple, list)):
        before = before[0]

    # put it before first existing identifier
    parent = before.parent
    parent.childs.insert(parent.childs.index(before), tag)
    tag.parent = parent
def transform_content(tags, content_transformer):
    """
    Transform content in all `tags` using result of
    `content_transformer(tag)` call.

    Children of each tag are replaced with one new
    ``dhtmlparser.HTMLElement`` created from the value returned by
    `content_transformer`.

    Args:
        tags (obj/list): HTMLElement instance, or list / tuple of HTMLElement
             instances.
        content_transformer (function): Function which is called as
                            ``content_transformer(tag)``.
    """
    # isinstance() instead of exact type comparison, so subclasses of
    # list / tuple are iterated and not wrapped as if they were one tag
    if not isinstance(tags, (tuple, list)):
        tags = [tags]

    for tag in tags:
        tag.childs = [
            dhtmlparser.HTMLElement(content_transformer(tag))
        ]
def postprocess_mods_volume(mods, uuid):
    """
    Fix bugs in `mods` produced by XSLT template.

    Following fixes are applied, in this order:

    - missing attributes are added to the ``<mods:mods>`` tag,
    - ``type="code"`` is set for the marccountry ``<mods:placeTerm>``,
    - `uuid` is added as ``<mods:identifier type="uuid">`` (only if set),
    - marccountry place with value ``xr`` is added, if there is none,
    - ``<mods:genre>`` with ``electronic title`` is added, if there is none,
    - title and textual place term are cleaned by :func:`_remove_hairs`.

    Args:
        mods (str): XML string generated by XSLT template. Instance of
             ``dhtmlparser.HTMLElement`` is also accepted and is not parsed
             again.
        uuid (str): UUID of the package. Falsy value skips the identifier.

    Returns:
        str: Updated XML.
    """
    # do not parse already parsed dom's
    dom = mods
    if not isinstance(mods, dhtmlparser.HTMLElement):
        dom = dhtmlparser.parseString(mods)

    # insert_tag() depends on the `.parent` links
    dhtmlparser.makeDoubleLinked(dom)

    # add missing parameter
    mods_tag = dom.find("mods:mods")
    if mods_tag:
        params = mods_tag[0].params

        # add missing attributes
        params["ID"] = "MODS_VOLUME_0001"
        params["xmlns:mods"] = "http://www.loc.gov/mods/v3"
        params["xmlns:xlink"] = "http://www.w3.org/1999/xlink"
        params["xmlns:xsi"] = "http://www.w3.org/2001/XMLSchema-instance"
        params["xsi:schemaLocation"] = (
            "http://www.w3.org/2001/XMLSchema-instance "
            "http://www.w3.org/2001/XMLSchema.xsd "
            "http://www.loc.gov/mods/v3 "
            "http://www.loc.gov/standards/mods/v3/mods-3-4.xsd "
            "http://www.w3.org/1999/xlink http://www.w3.org/1999/xlink.xsd"
        )

    # insert_tag() needs an element as the `root`, not the list returned by
    # .find() - fall back to the whole dom when there is no <mods:mods>
    mods_root = mods_tag[0] if mods_tag else dom

    # fix invalid type= parameter
    placeterm_tag = dom.match(
        "mods:originInfo",
        "mods:place",
        ["mods:placeTerm", {"authority": "marccountry"}]
    )
    if placeterm_tag:
        placeterm_tag[0].params["type"] = "code"

    # add identifier to the section with identifiers
    if uuid:
        uuid_tag = dhtmlparser.HTMLElement(
            "mods:identifier",
            {"type": "uuid"},
            [dhtmlparser.HTMLElement(uuid)]
        )
        insert_tag(uuid_tag, dom.find("mods:identifier"), mods_root)

    # add marccountry if not found
    marccountry = dom.find("mods:placeTerm", {"authority": "marccountry"})
    if not marccountry:
        marccountry_tag = dhtmlparser.HTMLElement(
            "mods:place",
            [
                dhtmlparser.HTMLElement(
                    # same spelling as in all the lookups in this function
                    "mods:placeTerm",
                    {"type": "code", "authority": "marccountry"},
                    [dhtmlparser.HTMLElement("xr")]
                )
            ]
        )

        origin_info = dom.find("mods:originInfo")
        if origin_info:
            insert_tag(
                marccountry_tag,
                dom.match("mods:mods", "mods:originInfo", "mods:place"),
                origin_info[0]
            )
        else:
            # there is no <mods:originInfo> to put the place into, so
            # create one instead of failing on the [0] lookup
            origin_info_tag = dhtmlparser.HTMLElement(
                "mods:originInfo",
                [marccountry_tag]
            )
            marccountry_tag.parent = origin_info_tag
            insert_tag(origin_info_tag, None, mods_root)

    # add <genre> tag if not found
    genre = dom.find(
        "mods:genre",
        fn=lambda x: x.getContent().lower().strip() == "electronic title"
    )
    if not genre:
        genre_tag = dhtmlparser.HTMLElement(
            "mods:genre",
            [dhtmlparser.HTMLElement("electronic title")]
        )
        insert_tag(genre_tag, dom.find("mods:originInfo"), mods_root)

    # remove hairs from some tags
    transform_content(
        dom.match("mods:mods", "mods:titleInfo", "mods:title"),
        lambda x: _remove_hairs(x.getContent())
    )
    transform_content(
        dom.match(
            "mods:originInfo",
            "mods:place",
            ["mods:placeTerm", {"type": "text"}]
        ),
        lambda x: _remove_hairs(x.getContent())
    )

    prefix = '<?xml version="1.0" encoding="UTF-8"?>\n\n'
    return prefix + dom.prettify()