# Source code for scitex_writer.export._arxiv_categories

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File: src/scitex_writer/export/_arxiv_categories.py

"""arXiv category data and suggestion algorithm.

Pure data structures and functions for arXiv subject classification.
No Django or ORM dependencies.
"""

import re
from typing import Dict, List, Tuple

# Catalogue of the arXiv categories this module knows about.
# Every entry is a dict with the keys "code", "name" and "description";
# the rows below are written as (code, name, description) and grouped by
# archive, in the order they should appear in the final list.
ARXIV_CATEGORIES: List[Dict[str, str]] = [
    {"code": code, "name": name, "description": description}
    for code, name, description in (
        # Computer Science
        ("cs.AI", "Artificial Intelligence", "Covers all areas of AI"),
        (
            "cs.CL",
            "Computation and Language",
            "Natural language processing, computational linguistics",
        ),
        (
            "cs.CV",
            "Computer Vision and Pattern Recognition",
            "Image processing, computer vision",
        ),
        ("cs.LG", "Machine Learning", "Machine learning research"),
        (
            "cs.NI",
            "Networking and Internet Architecture",
            "Network protocols, internet architecture",
        ),
        (
            "cs.SE",
            "Software Engineering",
            "Software development, engineering practices",
        ),
        # Mathematics
        ("math.ST", "Statistics Theory", "Mathematical statistics"),
        ("math.PR", "Probability", "Probability theory"),
        ("math.NA", "Numerical Analysis", "Numerical methods and analysis"),
        # Physics
        (
            "physics.comp-ph",
            "Computational Physics",
            "Computational methods in physics",
        ),
        (
            "physics.data-an",
            "Data Analysis, Statistics and Probability",
            "Data analysis in physics",
        ),
        # Quantitative Biology
        ("q-bio.BM", "Biomolecules", "Molecular biology, biochemistry"),
        ("q-bio.GN", "Genomics", "Genomics and bioinformatics"),
        ("q-bio.NC", "Neurons and Cognition", "Neuroscience, cognition"),
        # Statistics
        ("stat.AP", "Applications", "Applied statistics"),
        ("stat.ML", "Machine Learning", "Statistical machine learning"),
        # Electrical Engineering
        (
            "eess.SP",
            "Signal Processing",
            "Signal processing, filtering, detection",
        ),
    )
]

# Lowercase keywords and phrases, keyed by arXiv category code, that
# suggest_categories() looks for in the lowercased input text.
# Only some of the codes in ARXIV_CATEGORIES have an entry here; the order
# of the keys is the order in which categories are scored.
CATEGORY_KEYWORDS: Dict[str, List[str]] = {
    # Artificial Intelligence
    "cs.AI": [
        "artificial intelligence",
        "ai",
        "machine learning",
        "deep learning",
        "neural network",
    ],
    # Computation and Language
    "cs.CL": [
        "natural language",
        "nlp",
        "language model",
        "text processing",
        "linguistics",
    ],
    # Computer Vision and Pattern Recognition
    "cs.CV": [
        "computer vision",
        "image processing",
        "object detection",
        "recognition",
    ],
    # Machine Learning (cs)
    "cs.LG": [
        "machine learning",
        "learning algorithm",
        "classification",
        "regression",
    ],
    # Machine Learning (stat)
    "stat.ML": [
        "statistical learning",
        "bayesian",
        "statistics",
        "probabilistic",
    ],
    # Statistics Theory
    "math.ST": [
        "statistics",
        "statistical theory",
        "hypothesis testing",
    ],
    # Genomics
    "q-bio.GN": [
        "genomics",
        "bioinformatics",
        "dna",
        "rna",
        "genome",
    ],
    # Neurons and Cognition
    "q-bio.NC": [
        "neuroscience",
        "brain",
        "neural",
        "eeg",
        "cognition",
        "hippocampus",
    ],
    # Signal Processing
    "eess.SP": [
        "signal processing",
        "filtering",
        "spectral",
        "frequency",
        "wavelet",
    ],
}


def suggest_categories(
    content: str,
    max_suggestions: int = 5,
) -> List[Tuple[str, str, int]]:
    """Suggest arXiv categories based on text content.

    Every category listed in ``CATEGORY_KEYWORDS`` scores one point for each
    of its keywords that occurs in ``content``. Matching is case-insensitive
    and works on whole words/phrases (an optional trailing plural "s" is
    accepted), so short keywords such as "ai" or "rna" are not triggered by
    unrelated words like "brain" or "journal".

    Args:
        content: Text to analyse (typically title + abstract). Empty or
            missing text yields no suggestions.
        max_suggestions: Maximum number of suggestions to return. Zero or a
            negative value yields an empty list.

    Returns:
        List of (code, name, score) tuples sorted by descending score.
        Categories with equal scores keep their ``CATEGORY_KEYWORDS`` order,
        and categories without any keyword hit are omitted. ``name`` falls
        back to the code itself when the code is not in ``ARXIV_CATEGORIES``.
    """
    # A negative limit would otherwise slice from the end of the list and
    # silently return an unintended subset.
    if not content or max_suggestions <= 0:
        return []

    content_lower = content.lower()

    # Build a code -> name lookup from ARXIV_CATEGORIES
    code_to_name = {cat["code"]: cat["name"] for cat in ARXIV_CATEGORIES}

    def _mentions(keyword: str) -> bool:
        """Return True if *keyword* occurs in the text as a whole word/phrase."""
        # Lookarounds (rather than \b) keep this correct even for keywords
        # that begin or end with a non-word character; "s?" keeps simple
        # plurals ("neural networks", "genomes") matching.
        pattern = rf"(?<!\w){re.escape(keyword)}s?(?!\w)"
        return re.search(pattern, content_lower) is not None

    suggestions: List[Tuple[str, str, int]] = []
    for code, keywords in CATEGORY_KEYWORDS.items():
        score = sum(1 for kw in keywords if _mentions(kw))
        if score > 0:
            suggestions.append((code, code_to_name.get(code, code), score))

    # list.sort is stable, so ties stay in CATEGORY_KEYWORDS order.
    suggestions.sort(key=lambda item: item[2], reverse=True)
    return suggestions[:max_suggestions]
# EOF