Source code for scitex_scholar.formatting

#!/usr/bin/env python3
"""Citation formatting for plain paper dicts.

Provides BibTeX, RIS, EndNote, CSV, and text-style (APA, MLA, Chicago,
Vancouver) formatters.  Every function accepts a standard paper dict —
no ORM or framework dependencies.

Standard paper dict keys::

    title, authors_str, journal, year, doi, pmid, arxiv_id, url,
    abstract, document_type, citation_count, impact_factor,
    is_open_access, source, volume, number, pages, cite_key
"""

from __future__ import annotations

import re
from typing import Dict, List

# ── Type mappings ───────────────────────────────────────────────

# Maps a paper's ``document_type`` value to the BibTeX entry type emitted by
# ``to_bibtex``.  Document types missing from this table are written as
# ``article`` by ``to_bibtex`` (its lookup default).
DOC_TYPE_TO_BIBTEX = {
    "article": "article",
    "preprint": "misc",
    "book": "book",
    "chapter": "inbook",
    "conference": "inproceedings",
    "thesis": "phdthesis",
    "report": "techreport",
    "dataset": "misc",
}

# Maps a paper's ``document_type`` value to the RIS reference type written on
# the ``TY`` line by ``to_ris``.  Document types missing from this table
# (e.g. "preprint", "report", "dataset") are written as ``GEN`` by ``to_ris``.
DOC_TYPE_TO_RIS = {
    "article": "JOUR",
    "book": "BOOK",
    "chapter": "CHAP",
    "conference": "CONF",
    "thesis": "THES",
}

# Maps a paper's ``document_type`` value to the EndNote reference type written
# on the ``%0`` line by ``to_endnote``.  Document types missing from this
# table are written as ``Generic`` by ``to_endnote``.
DOC_TYPE_TO_ENDNOTE = {
    "article": "Journal Article",
    "book": "Book",
    "chapter": "Book Section",
    "conference": "Conference Paper",
    "thesis": "Thesis",
}

# Export format name -> file extension, leading dot included.
# NOTE(review): presumably used by callers to name exported files; "csv" and
# "json" have no entry in ``_FORMAT_FUNCS``, so ``papers_to_format`` rejects
# them — confirm they are handled elsewhere.
FORMAT_EXTENSIONS = {
    "bibtex": ".bib",
    "endnote": ".enw",
    "ris": ".ris",
    "csv": ".csv",
    "json": ".json",
}

# ── Text citation style templates ───────────────────────────────

# Nested mapping: citation style -> document type -> ``str.format`` template.
# ``to_text_citation`` fills the placeholders ``authors``, ``year``,
# ``title``, ``journal``, ``volume``, ``issue``, ``pages``, ``doi`` and
# ``publisher``.  An unknown style falls back to "apa" and an unknown
# document type falls back to the style's "article" template there.
CITATION_STYLES: Dict[str, Dict[str, str]] = {
    "apa": {
        "dataset": "{authors} ({year}). {title} [Data set]. {publisher}. {doi}",
        "article": (
            "{authors} ({year}). {title}. {journal}, {volume}({issue}), {pages}. {doi}"
        ),
    },
    "mla": {
        # Note: the MLA dataset template has no {authors} placeholder.
        "dataset": '"{title}." {publisher}, {year}, {doi}.',
        "article": (
            '{authors}. "{title}." {journal}, vol. {volume}, '
            "no. {issue}, {year}, pp. {pages}. {doi}."
        ),
    },
    "chicago": {
        "dataset": '{authors}. "{title}." {publisher}, {year}. {doi}.',
        "article": (
            '{authors}. "{title}." {journal} {volume}, '
            "no. {issue} ({year}): {pages}. {doi}."
        ),
    },
    "vancouver": {
        "dataset": (
            "{authors}. {title} [dataset]. {publisher}; {year}. Available from: {doi}"
        ),
        "article": (
            "{authors}. {title}. "
            "{journal}. {year};{volume}({issue}):{pages}. "
            "Available from: {doi}"
        ),
    },
}

# ── Normalisation helpers ───────────────────────────────────────


def clean_text(text: str) -> str:
    """Strip citation-breaking characters from *text* and tidy its whitespace.

    Curly braces, square brackets and backslashes are deleted; every run of
    whitespace is then collapsed to a single space and both ends are trimmed.
    Falsy input (``None`` or ``""``) yields an empty string.
    """
    if text:
        without_specials = re.sub(r"[{}\[\]\\]", "", text)
        single_spaced = re.sub(r"\s+", " ", without_specials)
        return single_spaced.strip()
    return ""
def generate_cite_key(paper: dict) -> str:
    """Generate a BibTeX citation key from a paper dict.

    The key is the last word of the first comma-separated author, reduced to
    lower-case ASCII letters, followed by the year (e.g. ``smith2024``).

    Parameters
    ----------
    paper : dict
        Paper dict; only ``authors_str`` and ``year`` are read.

    Returns
    -------
    str
        Citation key.  ``unknown`` stands in for the author part when no
        usable name is available, and ``XXXX`` for a missing year.
    """
    authors = paper.get("authors_str") or ""
    # The first comma-separated chunk can be empty or whitespace-only
    # (e.g. " " or ", Smith"); indexing its word list blindly would raise
    # IndexError, so check before taking the last word.
    words = authors.split(",")[0].split()
    surname = re.sub(r"[^a-zA-Z]", "", words[-1]).lower() if words else ""
    # A name without any ASCII letters would otherwise leave a bare-year key.
    if not surname:
        surname = "unknown"
    year = str(paper.get("year") or "XXXX")
    return f"{surname}{year}"
def paper_normalize(data: dict) -> dict:
    """Normalise a raw dict (e.g. API search result) to a standard paper dict.

    Field aliases are resolved (``authors``/``author``, ``doi``/``DOI``,
    ``abstract``/``snippet``, ``citations``/``citation_count`` and the
    various URL keys) and missing fields receive safe defaults.

    Parameters
    ----------
    data : dict
        Raw record.

    Returns
    -------
    dict
        Paper dict with the keys ``title``, ``authors_str``, ``journal``,
        ``year``, ``doi``, ``pmid``, ``arxiv_id``, ``url``, ``abstract``,
        ``document_type``, ``citation_count``, ``impact_factor``,
        ``is_open_access``, ``source``, ``volume``, ``number`` and ``pages``.
    """
    # Strip a trailing impact-factor annotation such as "Nature (IF: 50.5)".
    # This must be a regex substitution: ``str.replace`` would look for the
    # pattern text literally and never match.
    journal = re.sub(r"\s*\(IF[^)]*\)", "", data.get("journal") or "")
    return {
        "title": data.get("title") or "Unknown",
        "authors_str": data.get("authors") or data.get("author") or "",
        "journal": journal,
        "year": str(data.get("year") or ""),
        "doi": data.get("doi") or data.get("DOI") or "",
        "pmid": data.get("pmid") or "",
        "arxiv_id": data.get("arxiv_id") or "",
        "url": (
            data.get("externalUrl")
            or data.get("external_url")
            or data.get("url")
            or data.get("pdf_url")
            or ""
        ),
        "abstract": data.get("abstract") or data.get("snippet") or "",
        "document_type": data.get("document_type") or "article",
        "citation_count": data.get("citations") or data.get("citation_count") or 0,
        "impact_factor": data.get("impact_factor") or 0,
        "is_open_access": data.get("is_open_access", False),
        "source": data.get("source") or "unknown",
        "volume": data.get("volume") or "",
        "number": data.get("number") or "",
        "pages": data.get("pages") or "",
    }
# ── BibTeX ──────────────────────────────────────────────────────
def to_bibtex(paper: dict) -> str:
    """Render a standard paper dict as a single BibTeX entry.

    The entry type comes from ``DOC_TYPE_TO_BIBTEX`` (``article`` when the
    document type is unmapped) and the key from ``cite_key`` or, failing
    that, ``generate_cite_key``.  ``author`` is always written (``Unknown``
    when empty); every other field is written only when it has a value.
    Title, journal and abstract pass through ``clean_text``; the abstract is
    cut to 500 characters plus ``...``.
    """
    entry_type = DOC_TYPE_TO_BIBTEX.get(
        paper.get("document_type", "article"), "article"
    )
    cite_key = paper.get("cite_key") or generate_cite_key(paper)

    # Collect (field name, value) pairs in output order.
    fields = []

    cleaned_title = clean_text(paper.get("title") or "")
    if cleaned_title:
        fields.append(("title", cleaned_title))

    fields.append(("author", paper.get("authors_str") or "Unknown"))

    cleaned_journal = clean_text(paper.get("journal") or "")
    if cleaned_journal:
        fields.append(("journal", cleaned_journal))

    # These fields are copied verbatim whenever they are truthy.
    for name in ("year", "volume", "number", "pages", "doi", "pmid"):
        if paper.get(name):
            fields.append((name, paper[name]))

    if paper.get("arxiv_id"):
        fields.append(("eprint", paper["arxiv_id"]))
        fields.append(("archivePrefix", "arXiv"))

    if paper.get("url"):
        fields.append(("url", paper["url"]))

    if paper.get("abstract"):
        summary = clean_text(paper["abstract"])
        if len(summary) > 500:
            summary = summary[:500] + "..."
        fields.append(("abstract", summary))

    # Fields are comma-separated; the last one carries no trailing comma.
    body = ",\n".join(f" {name} = {{{value}}}" for name, value in fields)
    return f"@{entry_type}{{{cite_key},\n{body}\n}}"
# ── RIS ─────────────────────────────────────────────────────────
def to_ris(paper: dict) -> str:
    """Format a standard paper dict as a RIS entry.

    Every line follows the RIS tag layout ``XX  - value`` (two-letter tag,
    two spaces, hyphen, one space) and the record is closed by ``ER  - ``.

    Parameters
    ----------
    paper : dict
        Standard paper dict.  Reads ``document_type``, ``title``,
        ``authors_str``, ``journal``, ``year``, ``doi``, ``url`` and
        ``abstract``; fields without a value are omitted.

    Returns
    -------
    str
        Newline-joined RIS record.
    """
    doc_type = paper.get("document_type", "article")
    # Unmapped document types are exported as the generic RIS type.
    ris_type = DOC_TYPE_TO_RIS.get(doc_type, "GEN")
    lines = [f"TY  - {ris_type}"]

    title = clean_text(paper.get("title") or "")
    if title:
        lines.append(f"TI  - {title}")

    authors = paper.get("authors_str") or ""
    if authors:
        # One AU line per author; authors are separated by " and " or commas.
        for author in re.split(r"\s+and\s+|,\s*", authors):
            author = author.strip()
            if author:
                lines.append(f"AU  - {author}")

    journal = clean_text(paper.get("journal") or "")
    if journal:
        lines.append(f"JO  - {journal}")

    if paper.get("year"):
        lines.append(f"PY  - {paper['year']}")
    if paper.get("doi"):
        lines.append(f"DO  - {paper['doi']}")
    if paper.get("url"):
        lines.append(f"UR  - {paper['url']}")

    if paper.get("abstract"):
        abstract = clean_text(paper["abstract"])
        # Keep abstracts bounded: 1000 characters plus an ellipsis marker.
        if len(abstract) > 1000:
            abstract = abstract[:1000] + "..."
        lines.append(f"AB  - {abstract}")

    lines.append("ER  - ")
    return "\n".join(lines)
# ── EndNote ─────────────────────────────────────────────────────
def to_endnote(paper: dict) -> str:
    """Render a standard paper dict as an EndNote tagged record.

    The reference type (``%0``) comes from ``DOC_TYPE_TO_ENDNOTE`` with
    ``Generic`` as fallback.  Title (``%T``), authors (one ``%A`` line
    each), journal (``%J``), year (``%D``), DOI (``%R``), URL (``%U``) and
    abstract (``%X``, cut to 1000 characters plus ``...``) follow when
    present.  Lines are joined with newlines.
    """
    reference_type = DOC_TYPE_TO_ENDNOTE.get(
        paper.get("document_type", "article"), "Generic"
    )
    record = [f"%0 {reference_type}"]

    cleaned_title = clean_text(paper.get("title") or "")
    if cleaned_title:
        record.append(f"%T {cleaned_title}")

    author_field = paper.get("authors_str") or ""
    if author_field:
        # Authors are separated by " and " or by commas; skip empty pieces.
        names = (
            piece.strip() for piece in re.split(r"\s+and\s+|,\s*", author_field)
        )
        record.extend(f"%A {name}" for name in names if name)

    cleaned_journal = clean_text(paper.get("journal") or "")
    if cleaned_journal:
        record.append(f"%J {cleaned_journal}")

    # Fields copied verbatim whenever they are truthy.
    for tag, key in (("%D", "year"), ("%R", "doi"), ("%U", "url")):
        if paper.get(key):
            record.append(f"{tag} {paper[key]}")

    if paper.get("abstract"):
        summary = clean_text(paper["abstract"])
        if len(summary) > 1000:
            summary = summary[:1000] + "..."
        record.append(f"%X {summary}")

    return "\n".join(record)
# ── CSV ─────────────────────────────────────────────────────────
def to_csv_row(paper: dict) -> dict:
    """Build the CSV row dict for a standard paper dict.

    Columns, in order: ``Title``, ``Authors``, ``Journal``, ``Year``,
    ``DOI``, ``PMID``, ``URL``, ``Abstract``.  Missing or falsy values
    become ``""``; title, journal and abstract pass through ``clean_text``.
    """

    def value_of(key):
        # Falsy values (None, "", 0) are all exported as an empty cell.
        return paper.get(key) or ""

    row = {}
    row["Title"] = clean_text(value_of("title"))
    row["Authors"] = value_of("authors_str")
    row["Journal"] = clean_text(value_of("journal"))
    row["Year"] = value_of("year")
    row["DOI"] = value_of("doi")
    row["PMID"] = value_of("pmid")
    row["URL"] = value_of("url")
    row["Abstract"] = clean_text(value_of("abstract"))
    return row
# ── Text citation styles ───────────────────────────────────────
def to_text_citation(
    paper: dict,
    style: str = "apa",
    doc_type: str = "article",
) -> str:
    """Render a paper dict as a plain-text citation.

    Parameters
    ----------
    paper : dict
        Standard paper dict.
    style : str
        Key of ``CITATION_STYLES`` (``apa``, ``mla``, ``chicago``,
        ``vancouver``); an unknown style uses the ``apa`` templates.
    doc_type : str
        ``article`` or ``dataset``; an unknown type uses the style's
        ``article`` template.

    Returns
    -------
    str
        Formatted citation string.
    """
    style_templates = CITATION_STYLES.get(style, CITATION_STYLES["apa"])
    article_template = style_templates.get("article", "")
    template = style_templates.get(doc_type, article_template)

    # Placeholder values; the DOI slot falls back to the URL when no DOI is set.
    values = dict(
        authors=paper.get("authors_str") or "Unknown",
        year=paper.get("year") or "",
        title=clean_text(paper.get("title") or ""),
        journal=clean_text(paper.get("journal") or ""),
        volume=paper.get("volume") or "",
        issue=paper.get("number") or "",
        pages=paper.get("pages") or "",
        doi=paper.get("doi") or paper.get("url") or "",
        publisher=paper.get("publisher") or "",
    )

    try:
        citation = template.format(**values)
    except KeyError:
        # A template with a placeholder not supplied above: use a minimal form.
        citation = (
            f"{values['authors']} ({values['year']}). {values['title']}. {values['journal']}."
        )
    return citation
# ── arXiv BibTeX cleaning ──────────────────────────────────────

# biblatex field name -> classic BibTeX field name.
_BIBLATEX_TO_BIBTEX = {
    "journaltitle": "journal",
    "location": "address",
    "date": "year",
}

# Fields dropped entirely by ``clean_bibtex_for_arxiv``.
_ARXIV_UNSUPPORTED_FIELDS = ["url", "urldate", "file", "abstract"]


def _remove_bibtex_field(entry: str, field: str) -> str:
    """Delete every brace-delimited ``field = {...}`` assignment from *entry*."""
    # ``\b`` keeps "url" from matching inside longer names such as "biburl".
    start_re = re.compile(rf"\s*\b{re.escape(field)}\s*=\s*\{{", re.IGNORECASE)
    tail_re = re.compile(r",?\s*")
    pos = 0
    while True:
        match = start_re.search(entry, pos)
        if match is None:
            return entry
        # Walk to the brace that closes the value, honouring nested braces
        # such as ``{Uses {DNA} markers}``.
        depth = 1
        idx = match.end()
        while depth and idx < len(entry):
            char = entry[idx]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            idx += 1
        if depth:
            # Unbalanced value: leave the rest untouched rather than guess.
            return entry
        # Also swallow the separating comma and the whitespace after it.
        end = tail_re.match(entry, idx).end()
        entry = entry[: match.start()] + entry[end:]
        pos = match.start()


def clean_bibtex_for_arxiv(bibtex_entry: str) -> str:
    """Clean a BibTeX entry for arXiv compatibility.

    Renames biblatex fields to their classic BibTeX names
    (``journaltitle`` -> ``journal``, ``location`` -> ``address``,
    ``date`` -> ``year``) and removes the unsupported fields ``url``,
    ``urldate``, ``file`` and ``abstract``.  Removal is brace-balanced, so
    values containing nested braces are removed whole, and field names are
    matched on word boundaries, so e.g. ``biburl`` is left alone.

    Parameters
    ----------
    bibtex_entry : str
        BibTeX/biblatex entry text with brace-delimited field values.

    Returns
    -------
    str
        The cleaned entry.
    """
    for biblatex_field, bibtex_field in _BIBLATEX_TO_BIBTEX.items():
        bibtex_entry = re.sub(
            rf"\b{biblatex_field}\s*=", f"{bibtex_field} =", bibtex_entry
        )
    for field in _ARXIV_UNSUPPORTED_FIELDS:
        bibtex_entry = _remove_bibtex_field(bibtex_entry, field)
    # Removing the last field can leave a dangling comma before the final brace.
    bibtex_entry = re.sub(r",\s*}", "\n}", bibtex_entry)
    return bibtex_entry
# ── Batch helpers ───────────────────────────────────────────────

# Format name -> per-paper formatter used by ``papers_to_format``.
_FORMAT_FUNCS = {
    "bibtex": to_bibtex,
    "ris": to_ris,
    "endnote": to_endnote,
}


def papers_to_format(papers: List[dict], fmt: str) -> str:
    """Render every paper in *papers* with the formatter registered for *fmt*.

    Entries are separated by one blank line.  Raises ``ValueError`` when
    *fmt* is not a key of ``_FORMAT_FUNCS``.
    """
    if fmt not in _FORMAT_FUNCS:
        raise ValueError(f"Unsupported format: {fmt}. Use: {', '.join(_FORMAT_FUNCS)}")
    formatter = _FORMAT_FUNCS[fmt]
    return "\n\n".join(map(formatter, papers))
# ── Search result normalization ─────────────────────────────────
def paper_from_search_result(result: dict) -> dict:
    """Normalize a raw search-API result dict to the standard paper format.

    Handles field aliases from different search engines (``externalUrl``,
    ``snippet``, ``DOI``, ...) and fills missing fields with safe defaults.
    A trailing ``(IF ...)`` annotation is stripped from the journal name.

    Parameters
    ----------
    result : dict
        Raw search result.

    Returns
    -------
    dict
        Paper dict.  Authors and citation count are provided under both
        the legacy keys ``authors`` / ``citations`` and the standard keys
        ``authors_str`` / ``citation_count`` that the formatters in this
        module read, so the result can be passed straight to them.
    """
    journal = re.sub(r"\s*\(IF[^)]*\)", "", result.get("journal") or "")
    authors = result.get("authors") or ""
    citations = result.get("citations") or result.get("citation_count") or 0
    return {
        "title": result.get("title") or "Unknown",
        "authors": authors,
        "journal": journal,
        "year": str(result.get("year") or ""),
        "doi": result.get("doi") or result.get("DOI") or "",
        "pmid": result.get("pmid") or "",
        "arxiv_id": result.get("arxiv_id") or "",
        "citations": citations,
        "impact_factor": result.get("impact_factor") or 0,
        "is_open_access": result.get("is_open_access", False),
        "abstract": result.get("abstract") or result.get("snippet") or "",
        "url": (
            result.get("externalUrl")
            or result.get("external_url")
            or result.get("pdf_url")
            or ""
        ),
        "source": result.get("source") or "unknown",
        # Standard-key aliases: the formatters read these, not the legacy
        # ``authors`` / ``citations`` keys above.
        "authors_str": authors,
        "citation_count": citations,
    }
def make_citation_key(last_name: str, year=None) -> str:
    """Build a citation key from an author's last name and an optional year.

    Parameters
    ----------
    last_name : str
        Author last name; everything except ASCII letters is removed and
        the rest is lower-cased.
    year : optional
        Publication year.  Appended to the name when truthy.

    Returns
    -------
    str
        Citation key, e.g. ``smith2024`` (or ``smith`` without a year).
    """
    key = re.sub(r"[^a-zA-Z]", "", last_name).lower()
    if year:
        key = f"{key}{year}"
    return key
def sanitize_filename(filename: str, max_length: int = 50) -> str:
    """Turn an arbitrary string into a download-safe filename.

    Each of the characters ``< > : " / \\ | ? *`` becomes an underscore, the
    result is cut to *max_length* characters, surrounding whitespace is
    trimmed, and every remaining run of whitespace becomes one underscore.
    """
    underscored = re.sub(r'[<>:"/\\|?*]', "_", filename)
    clipped = underscored[:max_length].strip()
    return re.sub(r"\s+", "_", clipped)
# ── Public API ──────────────────────────────────────────────────

# Names exported by ``from scitex_scholar.formatting import *``.
__all__ = [
    "clean_text",
    "generate_cite_key",
    "paper_normalize",
    "paper_from_search_result",
    "make_citation_key",
    "sanitize_filename",
    "to_bibtex",
    "to_ris",
    "to_endnote",
    "to_csv_row",
    "to_text_citation",
    "clean_bibtex_for_arxiv",
    "papers_to_format",
    "FORMAT_EXTENSIONS",
    "DOC_TYPE_TO_BIBTEX",
    "DOC_TYPE_TO_RIS",
    "DOC_TYPE_TO_ENDNOTE",
    "CITATION_STYLES",
]

# EOF