#!/usr/bin/env python3
# File: ./src/scitex/scholar/filters.py
"""
Pure-function paper filtering for scitex_scholar.
Works on plain dicts only — no Django ORM or model imports required.
Expected paper dict keys:
title : str
authors : list[str]
journal : str
year : int or str
citations : int or str
impact_factor : float or str or None
is_open_access: bool
source : str
snippet : str (optional, used for doc_type detection)
"""
from typing import Any, Dict, List, Optional
[docs]
def apply_filters(
papers: List[Dict[str, Any]],
filters: Optional[Dict[str, Any]] = None,
parsed_operators: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
"""Filter a list of paper dicts by various criteria.
Parameters
----------
papers
List of paper dicts. Each dict should contain the keys described in
the module docstring; missing keys are treated as empty / zero values.
filters
Dict of filter criteria extracted from a search form or URL
parameters. Supported keys:
- ``year_from``, ``year_to`` -- year range (int)
- ``min_citations``, ``max_citations`` -- citation range (int)
- ``min_impact_factor`` -- minimum IF (float)
- ``max_impact_factor`` -- maximum IF (float)
- ``authors`` -- list of author name strings (legacy)
- ``journal`` -- journal name substring (legacy, str)
- ``open_access`` -- bool
- ``doc_type`` -- ``"review"`` | ``"preprint"`` | other
- ``language`` -- language string (``"english"`` passes)
parsed_operators
Dict produced by ``SearchQueryParser.from_shell_syntax()`` or the
equivalent ``parse_query_operators()`` function from scitex-cloud.
Supported keys:
- ``title_includes``, ``title_excludes`` -- list[str]
- ``author_includes``, ``author_excludes`` -- list[str]
- ``journal_includes``, ``journal_excludes`` -- list[str]
- ``year_min``, ``year_max`` -- int
- ``citations_min``, ``citations_max`` -- int
- ``impact_factor_min``, ``impact_factor_max`` -- float
Returns
-------
list of dict
Filtered list of paper dicts (same objects, not copies).
"""
if not filters and not parsed_operators:
return papers
filtered: List[Dict[str, Any]] = []
for paper in papers:
# ------------------------------------------------------------------
# Title includes / excludes (from parsed_operators)
# ------------------------------------------------------------------
if parsed_operators:
title = paper.get("title", "").lower()
if parsed_operators.get("title_includes"):
if not all(
term.lower() in title for term in parsed_operators["title_includes"]
):
continue
if parsed_operators.get("title_excludes"):
if any(
term.lower() in title for term in parsed_operators["title_excludes"]
):
continue
# ------------------------------------------------------------------
# Author includes / excludes (from parsed_operators)
# ------------------------------------------------------------------
authors_text = " ".join(paper.get("authors", [])).lower()
if parsed_operators.get("author_includes"):
if not all(
term.lower() in authors_text
for term in parsed_operators["author_includes"]
):
continue
if parsed_operators.get("author_excludes"):
if any(
term.lower() in authors_text
for term in parsed_operators["author_excludes"]
):
continue
# ------------------------------------------------------------------
# Journal includes / excludes (from parsed_operators)
# ------------------------------------------------------------------
journal_name = paper.get("journal", "").lower()
if parsed_operators.get("journal_includes"):
if not all(
term.lower() in journal_name
for term in parsed_operators["journal_includes"]
):
continue
if parsed_operators.get("journal_excludes"):
if any(
term.lower() in journal_name
for term in parsed_operators["journal_excludes"]
):
continue
# ------------------------------------------------------------------
# Year range (filters take precedence, parsed_operators can override)
# ------------------------------------------------------------------
year_from = filters.get("year_from") if filters else None
year_to = filters.get("year_to") if filters else None
if parsed_operators:
year_from = parsed_operators.get("year_min") or year_from
year_to = parsed_operators.get("year_max") or year_to
if year_from is not None or year_to is not None:
try:
year = int(paper.get("year", 0))
if year_from is not None and year < year_from:
continue
if year_to is not None and year > year_to:
continue
except (ValueError, TypeError):
continue
# ------------------------------------------------------------------
# Citation count
# ------------------------------------------------------------------
min_citations = filters.get("min_citations") if filters else None
max_citations = filters.get("max_citations") if filters else None
if parsed_operators:
min_citations = parsed_operators.get("citations_min") or min_citations
max_citations = parsed_operators.get("citations_max") or max_citations
if min_citations is not None or max_citations is not None:
try:
citations = int(paper.get("citations", 0))
if min_citations is not None and citations < min_citations:
continue
if max_citations is not None and citations > max_citations:
continue
except (ValueError, TypeError):
continue
# ------------------------------------------------------------------
# Impact factor
# ------------------------------------------------------------------
min_if = filters.get("min_impact_factor") if filters else None
max_if = filters.get("max_impact_factor") if filters else None
if parsed_operators:
min_if = parsed_operators.get("impact_factor_min") or min_if
max_if = parsed_operators.get("impact_factor_max") or max_if
if min_if is not None or max_if is not None:
try:
impact_factor = float(paper.get("impact_factor", 0) or 0)
if min_if is not None and impact_factor < min_if:
continue
if max_if is not None and impact_factor > max_if:
continue
except (ValueError, TypeError):
continue
# ------------------------------------------------------------------
# Legacy author filter (filters["authors"] is a list of name strings)
# ------------------------------------------------------------------
if filters and filters.get("authors"):
authors_text = " ".join(paper.get("authors", [])).lower()
if not any(name.lower() in authors_text for name in filters["authors"]):
continue
# ------------------------------------------------------------------
# Legacy journal filter (filters["journal"] is a substring)
# ------------------------------------------------------------------
if filters and filters.get("journal"):
journal_name = paper.get("journal", "").lower()
if filters["journal"].lower() not in journal_name:
continue
# ------------------------------------------------------------------
# Open access
# ------------------------------------------------------------------
if filters and filters.get("open_access") and not paper.get("is_open_access"):
continue
# ------------------------------------------------------------------
# Document type (basic heuristic)
# ------------------------------------------------------------------
if filters and filters.get("doc_type"):
title_and_snippet = (
paper.get("title", "") + " " + paper.get("snippet", "")
).lower()
doc_type = filters["doc_type"].lower()
if doc_type == "review" and "review" not in title_and_snippet:
continue
elif (
doc_type == "preprint"
and "preprint" not in paper.get("source", "").lower()
):
continue
# ------------------------------------------------------------------
# Language (basic: only "english" passes)
# ------------------------------------------------------------------
if filters and filters.get("language"):
if filters["language"].lower() != "english":
continue
filtered.append(paper)
return filtered
# EOF