Source code for scitex_scholar.core.Papers

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-09-30 22:24:29 (ywatanabe)"
# File: /home/ywatanabe/proj/SciTeX-Code/src/scitex/scholar/core/Papers.py
# ----------------------------------------
from __future__ import annotations

import os

__FILE__ = __file__
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

"""Papers class for SciTeX Scholar module.

Papers is a simple collection of Paper objects.
All business logic is handled by Scholar or utility functions.

This is a simplified version - reduced from 39 methods to ~15 methods.
Business logic has been moved to Scholar and utility functions.
"""

from pathlib import Path
from typing import Any, Callable, Dict, Iterator, List, Optional, Union

import scitex_logging as logging

from scitex_scholar.config import ScholarConfig
from scitex_scholar.core.Paper import Paper

logger = logging.getLogger(__name__)


class Papers:
    """A simple collection of Paper objects.

    This is a minimal collection class. Most business logic (loading,
    saving, enrichment, etc.) is handled by Scholar.

    Methods have been reduced from 39 to ~15 for simplicity.
    Complex operations should use Scholar or utility functions.
    """

    def __init__(
        self,
        papers: Optional[Union[List[Paper], List[Dict]]] = None,
        project: Optional[str] = None,
        config: Optional[ScholarConfig] = None,
    ):
        """Initialize Papers collection.

        Args:
            papers: List of Paper objects or dicts to convert to Papers.
                Items of any other type are skipped with a warning.
            project: Project name for organizing papers (defaults to
                ``"default"``).
            config: Scholar configuration (a fresh ``ScholarConfig`` is
                created when omitted).
        """
        self.project = project or "default"
        self.config = config or ScholarConfig()

        # Initialize papers list
        self._papers: List[Paper] = []

        if papers:
            for item in papers:
                if isinstance(item, Paper):
                    self._papers.append(item)
                elif isinstance(item, dict):
                    # Handle dict input - Pydantic handles validation
                    self._papers.append(Paper.from_dict(item))
                else:
                    logger.warning(f"Skipping invalid item type: {type(item)}")

    # =========================================================================
    # BASIC COLLECTION METHODS
    # =========================================================================

    def __len__(self) -> int:
        """Number of papers in collection."""
        return len(self._papers)

    def __iter__(self) -> Iterator[Paper]:
        """Iterate over papers."""
        return iter(self._papers)

    def __getitem__(self, index: Union[int, slice]) -> Union[Paper, "Papers"]:
        """Get paper(s) by index or slice.

        Args:
            index: Integer index or slice

        Returns:
            Single Paper if integer index, Papers collection if slice
        """
        if isinstance(index, slice):
            return Papers(self._papers[index], project=self.project, config=self.config)
        return self._papers[index]

    def __repr__(self) -> str:
        """String representation."""
        return f"Papers(count={len(self)}, project={self.project})"

    def __str__(self) -> str:
        """Human-readable string."""
        if len(self) == 0:
            return "Empty Papers collection"
        elif len(self) == 1:
            return "Papers collection with 1 paper"
        else:
            return f"Papers collection with {len(self)} papers"

    def __dir__(self) -> List[str]:
        """Custom dir for better discoverability."""
        base_attrs = list(object.__dir__(self))
        custom_attrs = [
            "papers",
            "filter",
            "sort_by",
            "append",
            "extend",
            "to_list",
            "summary",
            "to_dict",
            "to_dataframe",
            "from_bibtex",
            "save",
        ]
        return sorted(set(base_attrs + custom_attrs))

    # =========================================================================
    # SIMPLE COLLECTION OPERATIONS
    # =========================================================================

    @property
    def papers(self) -> List[Paper]:
        """Get the underlying papers list (not a copy)."""
        return self._papers

    def append(self, paper: Paper) -> None:
        """Add a paper to the collection.

        Args:
            paper: Paper to add. Non-Paper objects are rejected with a warning.
        """
        if isinstance(paper, Paper):
            self._papers.append(paper)
        else:
            logger.warning(f"Cannot append non-Paper object: {type(paper)}")

    def extend(self, papers: Union[List[Paper], "Papers"]) -> None:
        """Add multiple papers to the collection.

        Args:
            papers: List of papers or another Papers collection. Non-Paper
                items inside a list are skipped with a warning, mirroring
                ``append``.
        """
        if isinstance(papers, Papers):
            self._papers.extend(papers._papers)
        elif isinstance(papers, list):
            for paper in papers:
                if isinstance(paper, Paper):
                    self._papers.append(paper)
                else:
                    # Previously dropped silently; report it like append() does.
                    logger.warning(f"Cannot append non-Paper object: {type(paper)}")
        else:
            logger.warning(f"Cannot extend with type: {type(papers)}")

    def to_list(self) -> List[Paper]:
        """Get papers as a list.

        Returns:
            A new list holding the same Paper objects
        """
        return list(self._papers)

    def filter(
        self,
        condition: Optional[Callable[[Paper], bool]] = None,
        year_min: Optional[int] = None,
        year_max: Optional[int] = None,
        has_doi: Optional[bool] = None,
        has_abstract: Optional[bool] = None,
        has_pdf: Optional[bool] = None,
        min_citations: Optional[int] = None,
        max_citations: Optional[int] = None,
        min_impact_factor: Optional[float] = None,
        max_impact_factor: Optional[float] = None,
        journal: Optional[str] = None,
        author: Optional[str] = None,
        keyword: Optional[str] = None,
        publisher: Optional[str] = None,
        **kwargs,
    ) -> "Papers":
        """Filter papers by condition or criteria.

        When ``condition`` is a callable it is the only filter applied; the
        criteria parameters are used only when no callable is given.

        Parameters
        ----------
        condition
            Function that takes a Paper and returns bool.
        year_min
            Minimum year.
        year_max
            Maximum year.
        has_doi
            Filter papers with/without DOI.
        has_abstract
            Filter papers with/without abstract.
        has_pdf
            Filter papers with/without PDF URL.
        min_citations
            Minimum citation count.
        max_citations
            Maximum citation count.
        min_impact_factor
            Minimum journal impact factor.
        max_impact_factor
            Maximum journal impact factor.
        journal
            Journal name (partial match).
        author
            Author name (partial match).
        keyword
            Keyword (searches in keywords, title, abstract).
        publisher
            Publisher name (partial match).
        **kwargs
            Additional keyword arguments accepted for backward
            compatibility. They are not used for filtering; a warning is
            logged when any are supplied.

        Returns
        -------
        Papers
            New Papers collection with filtered papers.

        Examples
        --------
        Filter using a lambda condition::

            high_impact = papers.filter(lambda p: p.journal_impact_factor and p.journal_impact_factor > 10)
            highly_cited = papers.filter(lambda p: p.citation_count and p.citation_count > 500)
            recent = papers.filter(lambda p: p.year and p.year >= 2020)

        Filter using built-in parameters::

            high_impact_v2 = papers.filter(min_impact_factor=10.0)
            highly_cited_v2 = papers.filter(min_citations=500)
            recent_v2 = papers.filter(year_min=2020)

        Combine multiple parameters::

            filtered = papers.filter(
                min_impact_factor=5.0,
                min_citations=100,
                year_min=2015,
                year_max=2023,
                journal="Nature",
                has_doi=True,
            )

        Chain filters for AND logic::

            elite_recent = papers.filter(min_impact_factor=10).filter(year_min=2020)
        """
        if kwargs:
            # A misspelled criterion would otherwise silently return
            # unfiltered results.
            logger.warning(f"Ignoring unsupported filter arguments: {sorted(kwargs)}")

        # If a lambda/function condition is provided, use it
        if condition is not None and callable(condition):
            filtered = [p for p in self._papers if condition(p)]
            logger.info(f"Lambda filter: {len(self._papers)} -> {len(filtered)} papers")
            return Papers(filtered, project=self.project, config=self.config)

        # Otherwise use criteria-based filtering
        from scitex_scholar._utils.papers_utils import filter_papers_advanced

        # Bounds are forwarded as given. The former ``value or kwargs.get(...)``
        # turned a legitimate 0 / 0.0 bound into None, and the kwargs lookup
        # could never match because these names are explicit parameters.
        result = filter_papers_advanced(
            self,
            year_min=year_min,
            year_max=year_max,
            has_doi=has_doi,
            has_abstract=has_abstract,
            has_pdf=has_pdf,
            min_citations=min_citations,
            max_citations=max_citations,
            min_impact_factor=min_impact_factor,
            max_impact_factor=max_impact_factor,
            journal=journal,
            author=author,
            keyword=keyword,
            publisher=publisher,
        )

        # Preserve project and config
        result.project = self.project
        result.config = self.config

        logger.info(f"Filtered: {len(self._papers)} -> {len(result)} papers")
        return result

    def sort_by(self, *criteria, reverse: bool = False, **kwargs) -> "Papers":
        """Sort papers by criteria.

        Parameters
        ----------
        *criteria
            Field names (as strings) or lambda functions to sort by.
        reverse
            Sort in descending order (default: False).
        **kwargs
            Additional options (currently unused).

        Returns
        -------
        Papers
            New sorted Papers collection.

        Notes
        -----
        Available Paper fields for sorting:

        - ``title`` -- Paper title
        - ``year`` -- Publication year
        - ``citation_count`` -- Number of citations
        - ``journal_impact_factor`` -- Journal impact factor
        - ``journal`` -- Journal name
        - ``publisher`` -- Publisher name
        - ``doi`` -- Digital Object Identifier
        - ``created_at`` -- When record was created
        - ``updated_at`` -- When record was last updated

        Examples
        --------
        Sort by a single field::

            by_year = papers.sort_by('year')
            by_citations_desc = papers.sort_by('citation_count', reverse=True)

        Sort by multiple fields (primary, secondary, etc.)::

            by_year_then_citations = papers.sort_by('year', 'citation_count')

        Sort using a lambda function::

            by_citations = papers.sort_by(lambda p: p.citation_count or 0, reverse=True)
            by_year_safe = papers.sort_by(lambda p: p.year if p.year else 9999)

        Sort by a computed value::

            by_citation_per_year = papers.sort_by(
                lambda p: (p.citation_count or 0) / (2024 - p.year) if p.year else 0,
                reverse=True,
            )
        """
        if not criteria:
            return Papers(self._papers, project=self.project, config=self.config)

        # Handle single lambda
        if len(criteria) == 1 and callable(criteria[0]):
            sorted_papers = sorted(self._papers, key=criteria[0], reverse=reverse)
            return Papers(sorted_papers, project=self.project, config=self.config)

        # Handle field names
        from scitex_scholar._utils.papers_utils import sort_papers_multi

        result = sort_papers_multi(self, list(criteria), reverse=reverse)
        # Keep project/config on the sorted collection, consistent with
        # filter() and with the lambda path above.
        if isinstance(result, Papers):
            result.project = self.project
            result.config = self.config
        return result

    # =========================================================================
    # BACKWARD COMPATIBILITY METHODS
    # These delegate to utilities or Scholar for the actual implementation
    # =========================================================================

    @staticmethod
    def _existing_file(candidate: Any) -> Optional[Path]:
        """Return ``candidate`` as a Path if it names an existing file, else None."""
        if not isinstance(candidate, (str, Path)):
            return None
        try:
            path = Path(candidate)
            # is_file() rather than exists(): "" resolves to "." which exists
            # but cannot be opened as a BibTeX file.
            return path if path.is_file() else None
        except (OSError, ValueError):
            # Inline BibTeX text is usually too long (or otherwise invalid)
            # to be a file name; stat() may raise instead of returning False.
            return None

    @classmethod
    def from_bibtex(cls, bibtex_input: Union[str, Path]) -> "Papers":
        """Load papers from BibTeX.

        DEPRECATED: Use Scholar.from_bibtex() instead.
        This method is kept for backward compatibility.

        Args:
            bibtex_input: Path to BibTeX file or BibTeX string. Input that
                does not name an existing file is parsed as BibTeX text.

        Returns:
            Papers collection
        """
        logger.warning(
            "Papers.from_bibtex() is deprecated. Use Scholar.from_bibtex() instead."
        )

        # Check if it's a file path
        path = cls._existing_file(bibtex_input)
        if path is not None:
            return cls._from_bibtex_file(path)

        # Otherwise treat as BibTeX text
        return cls._from_bibtex_text(str(bibtex_input))

    @classmethod
    def _papers_from_entries(cls, entries: List[Dict[str, Any]]) -> List[Paper]:
        """Convert parsed BibTeX entries to Paper objects, dropping falsy results."""
        converted = (cls._bibtex_entry_to_paper(entry) for entry in entries)
        return [paper for paper in converted if paper]

    @classmethod
    def _from_bibtex_file(cls, file_path: Union[str, Path]) -> "Papers":
        """Load papers from BibTeX file.

        Args:
            file_path: Path to BibTeX file

        Returns:
            Papers collection

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        import bibtexparser

        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"BibTeX file not found: {file_path}")

        logger.info(f"Loading BibTeX from {file_path}")

        with open(file_path, "r", encoding="utf-8") as f:
            bib_db = bibtexparser.load(f)

        logger.info(f"Loaded {len(bib_db.entries)} BibTeX entries from {file_path}")

        papers = cls._papers_from_entries(bib_db.entries)

        logger.success(f"Created {len(papers)} papers from BibTeX file")
        return cls(papers)

    @classmethod
    def _from_bibtex_text(cls, bibtex_content: str) -> "Papers":
        """Load papers from BibTeX text.

        Args:
            bibtex_content: BibTeX content as string

        Returns:
            Papers collection
        """
        import bibtexparser

        bib_db = bibtexparser.loads(bibtex_content)
        logger.info(f"Parsed {len(bib_db.entries)} BibTeX entries from text")

        papers = cls._papers_from_entries(bib_db.entries)

        logger.success(f"Created {len(papers)} papers from BibTeX text")
        return cls(papers)

    @staticmethod
    def _split_bibtex_authors(author_field: str) -> List[str]:
        """Split a BibTeX ``author`` value on its ``and`` separators."""
        import re

        # Any whitespace (including line breaks in wrapped entries) may
        # surround the separator.
        names = re.split(r"\s+and\s+", author_field or "")
        return [name.strip() for name in names if name.strip()]

    @staticmethod
    def _parse_bibtex_year(raw_year: Any) -> Optional[int]:
        """Return the year as an int, or None when it is not purely numeric."""
        if raw_year is None:
            return None
        text = str(raw_year).strip()
        return int(text) if text.isdecimal() else None

    @staticmethod
    def _split_bibtex_keywords(keyword_field: str) -> List[str]:
        """Split a comma-separated ``keywords`` value, dropping empty items."""
        return [kw.strip() for kw in (keyword_field or "").split(",") if kw.strip()]

    @staticmethod
    def _bibtex_entry_to_paper(entry: Dict[str, Any]) -> Paper:
        """Convert BibTeX entry to Paper object.

        Args:
            entry: BibTeX entry dictionary

        Returns:
            Paper object
        """
        # Get fields from BibTeX entry (keys normalised to lower case)
        fields = {k.lower(): v for k, v in entry.items()}

        authors = (
            Papers._split_bibtex_authors(fields["author"]) if "author" in fields else []
        )
        # Parse year - let Pydantic handle validation
        year = Papers._parse_bibtex_year(fields.get("year"))
        keywords = (
            Papers._split_bibtex_keywords(fields["keywords"])
            if "keywords" in fields
            else []
        )

        # Create Paper with Pydantic structure
        paper = Paper()

        # Set basic metadata
        paper.metadata.basic.title = fields.get("title", "").strip("{}")
        paper.metadata.basic.authors = authors
        paper.metadata.basic.abstract = fields.get("abstract", "")
        paper.metadata.basic.year = year
        paper.metadata.basic.keywords = keywords

        # Set ID metadata
        doi = fields.get("doi")
        if doi:
            paper.metadata.set_doi(doi)
        paper.metadata.id.pmid = fields.get("pmid")
        paper.metadata.id.arxiv_id = fields.get("arxiv")

        # Set publication metadata
        paper.metadata.publication.journal = fields.get("journal")

        # Set URL metadata (the BibTeX ``url`` field is recorded as a PDF URL)
        url = fields.get("url")
        if url:
            paper.metadata.url.pdfs.append({"url": url, "source": "bibtex"})

        # Store original BibTeX fields for later reconstruction
        paper._original_bibtex_fields = fields.copy()
        # The bibtexparser load()/loads() entry dicts keep the entry type and
        # citation key under "ENTRYTYPE" and "ID" ("entrytype"/"id" after
        # lower-casing); the legacy "entry_type"/"key" names stay as fallback.
        paper._bibtex_entry_type = (
            fields.get("entrytype") or fields.get("entry_type") or "misc"
        )
        paper._bibtex_key = fields.get("id") or fields.get("key") or ""

        return paper

    @staticmethod
    def _resolve_save_format(output_path: Path, format: Optional[str]) -> str:
        """Return the lower-cased output format, inferring it from the suffix for "auto"/None."""
        requested = (format or "auto").lower()
        if requested != "auto":
            return requested
        by_suffix = {".bib": "bibtex", ".bibtex": "bibtex", ".json": "json", ".csv": "csv"}
        # Unknown extensions fall back to BibTeX.
        return by_suffix.get(Path(output_path).suffix.lower(), "bibtex")

    def save(
        self,
        output_path: Union[str, Path],
        format: Optional[str] = "auto",
        **kwargs,
    ) -> None:
        """Save papers to file.

        DEPRECATED: Use Scholar.save_papers() or Scholar.export_bibtex() instead.
        This method is kept for backward compatibility.

        Args:
            output_path: Path to save file
            format: Output format (auto, bibtex, json, csv), case-insensitive.
                ``None`` is treated as ``"auto"``, which picks the format
                from the file extension and falls back to BibTeX.
            **kwargs: Additional options (currently unused)

        Raises:
            ValueError: If ``format`` is not a supported format.
        """
        logger.warning(
            "Papers.save() is deprecated. Use Scholar.export_bibtex() instead."
        )

        output_path = Path(output_path)
        resolved = self._resolve_save_format(output_path, format)
        if resolved not in ("bibtex", "json", "csv"):
            raise ValueError(f"Unsupported format: {format}")

        output_path.parent.mkdir(parents=True, exist_ok=True)

        if resolved == "bibtex":
            from scitex_scholar._utils.papers_utils import papers_to_bibtex

            bibtex_content = papers_to_bibtex(self, output_path=None)
            # Explicit UTF-8 so output does not depend on the locale and
            # matches the JSON branch.
            output_path.write_text(bibtex_content, encoding="utf-8")
        elif resolved == "json":
            import json

            from scitex_scholar._utils.papers_utils import papers_to_dict

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(papers_to_dict(self), f, indent=2, ensure_ascii=False)
        else:
            from scitex_scholar._utils.papers_utils import papers_to_dataframe

            df = papers_to_dataframe(self)
            df.to_csv(output_path, index=False)

        logger.success(f"Saved {len(self)} papers to {output_path}")

    def to_dict(self) -> List[Dict[str, Any]]:
        """Convert to dictionary.

        DEPRECATED: Use papers_utils.papers_to_dict() for new code.

        Returns:
            Dictionary representation
        """
        from scitex_scholar._utils.papers_utils import papers_to_dict

        return papers_to_dict(self)

    def to_dataframe(self) -> Any:
        """Convert to pandas DataFrame.

        DEPRECATED: Use papers_utils.papers_to_dataframe() for new code.

        Returns:
            DataFrame with papers data, or None when an ImportError is
            raised while importing or running the conversion.
        """
        try:
            from scitex_scholar._utils.papers_utils import papers_to_dataframe

            return papers_to_dataframe(self)
        except ImportError:
            logger.error("pandas is required for to_dataframe()")
            return None

    def summary(self) -> Dict[str, Any]:
        """Get summary statistics.

        DEPRECATED: Use papers_utils.papers_statistics() for new code.

        Returns:
            Dictionary with statistics
        """
        from scitex_scholar._utils.papers_utils import papers_statistics

        return papers_statistics(self)
# =========================================================================
# METHODS REMOVED (use Scholar or utilities instead):
# =========================================================================
# The following methods have been removed to simplify the class:
# - sync_with_library() -> Use Scholar internally
# - create_project_symlinks() -> Use Scholar internally
# - get_project_statistics() -> Use Scholar.get_library_statistics()
# - download_pdfs() -> Use Scholar.download_pdfs()
# - enrich() -> Use Scholar.enrich()
# - merge_papers() -> Use papers_utils.merge_papers()
# - deduplicate() -> Use papers_utils.deduplicate_papers()
#
# This reduces complexity from 39 methods to ~15 methods.
# All business logic is now in Scholar or utility functions.

# For backward compatibility
__all__ = ["Papers"]

if __name__ == "__main__":

    def main():
        """Run a small console demo of the Papers collection interface."""
        banner = "=" * 60
        print(banner)
        print("Papers Class - Simplified Collection")
        print(banner)

        # Build three sample papers using the Pydantic metadata structure.
        samples = []
        for title, year, journal in (
            ("Paper 1", 2023, "Nature"),
            ("Paper 2", 2024, "Science"),
            ("Paper 3", 2022, "Cell"),
        ):
            sample = Paper()
            sample.metadata.basic.title = title
            sample.metadata.basic.year = year
            sample.metadata.publication.journal = journal
            samples.append(sample)

        papers = Papers(samples)
        print(f"\n1. Collection: {papers}")
        print(f"   Count: {len(papers)}")
        print(f"   First: {papers[0].metadata.basic.title}")

        # Filtering with a callable condition.
        def is_recent(p):
            return p.metadata.basic.year and p.metadata.basic.year >= 2023

        recent = papers.filter(is_recent)
        print(f"\n2. Filtered (year >= 2023): {len(recent)} papers")

        # Sorting with a callable key.
        by_year = papers.sort_by(lambda p: p.metadata.basic.year or 0)
        print("\n3. Sorted by year:")
        for entry in by_year:
            print(f"   {entry.metadata.basic.year}: {entry.metadata.basic.title}")

        print("\n✅ Papers class simplified!")
        print("   - Reduced from 39 to ~15 methods")
        print("   - Business logic moved to Scholar")
        print("   - Clean collection interface")

    main()

# EOF