"""Source code for projspec.proj.hf: Hugging Face Hub model and dataset project specs."""

from io import StringIO

from projspec.proj import ProjectSpec, ParseFailed

# Metadata keys that are specific to dataset cards and never appear on model cards.
# Used to distinguish the two repo types when both have a README.md.
_DATASET_DISCRIMINATORS = frozenset(
    [
        "task_categories",
        "task_ids",
        "dataset_info",
        "size_categories",
        "annotations_creators",
        "language_creators",
        "source_datasets",
        "configs",
    ]
)


class HuggingFaceRepo(ProjectSpec):
    """A model repository hosted on the Hugging Face Hub.

    The repo is recognised by a ``README.md`` (the model card) whose YAML
    front-matter parses to a mapping and contains none of the dataset-card
    keys listed in ``_DATASET_DISCRIMINATORS``.

    Parsed contents
    ---------------
    descriptive_metadata
        Carries ``language``, ``library_name``, ``tags``, ``base_model`` and
        ``new_version`` - whichever are present in the card.
    license *(optional)*
        Present when a ``license`` key is found in the front-matter.
    citations *(optional)*
        One entry per ``arxiv:<id>`` tag.
    """

    icon = "🙂"
    spec_doc = "https://huggingface.co/docs/hub/en/model-cards"
    # full_spec = ("https://github.com/huggingface/huggingface_hub/blob/"
    #              "main/src/huggingface_hub/templates/modelcard_template.md")
    # fields: language, library_name, tags, base_model, new_version, datasets
    #    license, license_name, license_link, model-index (results)

    # Dataset names are the same as the repo names in HF.

    def match(self) -> bool:
        """Return True when the project directory contains a ``README.md``."""
        return "README.md" in self.proj.basenames

    def parse(self) -> None:
        """Read the model card front-matter and populate ``self._contents``.

        Raises
        ------
        ParseFailed
            If ``README.md`` cannot be read, has no ``---`` delimited
            front-matter, the front-matter is not a YAML mapping, or the card
            carries dataset-only keys.
        """
        import yaml

        from projspec.content.metadata import (
            Citation,
            DescriptiveMetadata,
            License,
        )

        try:
            with self.get_file("README.md") as f:
                txt = f.read()
        except OSError as exc:
            raise ParseFailed(f"Could not read README.md: {exc}") from exc

        # Front-matter is the text between the first two "---" lines.
        if txt.count("---\n") < 2:
            raise ParseFailed("README.md has no YAML front-matter")
        try:
            meta = yaml.safe_load(StringIO(txt.split("---\n")[1]))
        except Exception as exc:
            # Deliberately broad: any loader failure means "not a model card".
            raise ParseFailed(f"Invalid YAML front-matter: {exc}") from exc
        if not isinstance(meta, dict):
            raise ParseFailed("YAML front-matter did not parse to a mapping")

        if _DATASET_DISCRIMINATORS.intersection(meta):
            raise ParseFailed("README.md is a dataset card")

        # The card key is spelled "license" (see the scaffold in _create).
        if "license" in meta:
            self._contents["license"] = License(
                proj=self.proj,
                shortname=meta["license"],
                fullname=meta.get("license_name"),
                url=meta.get("license_link"),
            )

        for arxiv_id in self._arxiv_ids(meta):
            self._contents.setdefault("citations", []).append(
                Citation(proj=self.proj, meta=dict(arxiv=arxiv_id))
            )

        # TODO: datasets are links to other repos
        descriptive_keys = (
            "language",
            "library_name",
            "tags",
            "base_model",
            "new_version",
        )
        self._contents["descriptive_metadata"] = DescriptiveMetadata(
            proj=self.proj,
            meta={k: meta[k] for k in descriptive_keys if k in meta},
        )

    @staticmethod
    def _arxiv_ids(meta: dict) -> list[str]:
        """Return the ids of all ``arxiv:<id>`` entries under ``meta["tags"]``."""
        # An empty "tags:" key loads as None; a scalar is treated as one tag.
        tags = meta.get("tags") or []
        if not isinstance(tags, list):
            tags = [tags]
        return [
            tag.removeprefix("arxiv:")
            for tag in tags
            if isinstance(tag, str) and tag.startswith("arxiv:")
        ]

    @staticmethod
    def _create(path: str) -> None:
        """Scaffold a minimal model card at ``{path}/README.md``."""
        with open(f"{path}/README.md", "w", encoding="utf-8") as f:
            f.write(
                """---
license: other
license_name: coqui-public-model-license
license_link: https://coqui.ai/cpml
library_name: flair
tags:
- flair
base_model: HuggingFaceH4/zephyr-7b-beta
new_version: 0.1
datasets:
- stanfordnlp/imdb
- HuggingFaceFW/fineweb
---
"""
            )


class HuggingFaceDataset(ProjectSpec):
    """A dataset repository hosted on the Hugging Face Hub.

    ``match`` only requires a ``README.md``; ``parse`` then needs YAML
    front-matter that loads to a mapping and rejects cards carrying any
    model-only key (``library_name``, ``base_model``, ``new_version``).

    Parsed contents
    ---------------
    descriptive_metadata
        Carries ``pretty_name``, ``language``, ``tags``, ``task_categories``,
        ``task_ids``, ``size_categories``, ``source_datasets``,
        ``annotations_creators``, ``language_creators`` and
        ``paperswithcode_id`` - whichever are present in the card.
    license *(optional)*
        Present when a ``license`` key is found in the front-matter.
    citations *(optional)*
        One entry per ``arxiv:<id>`` tag.
    tabular_data *(optional)*
        Built from ``dataset_info``: a list with one entry per named config,
        or a single entry called ``"data"`` when ``dataset_info`` is a
        mapping with ``features``.
    """

    icon = "🗄️"
    # detailed spec: https://raw.githubusercontent.com/huggingface/hub-docs/refs/heads/main/datasetcard.md
    spec_doc = "https://huggingface.co/docs/hub/datasets-cards"

    def match(self) -> bool:
        """Return True when the project directory contains a ``README.md``."""
        return "README.md" in self.proj.basenames

    def parse(self) -> None:
        """Read the dataset card front-matter and populate ``self._contents``.

        Raises
        ------
        ParseFailed
            If ``README.md`` cannot be read, has no ``---`` delimited
            front-matter, the front-matter is not a YAML mapping, or the card
            carries model-only keys.
        """
        import yaml

        from projspec.content.data import TabularData
        from projspec.content.metadata import (
            Citation,
            DescriptiveMetadata,
            License,
        )

        try:
            with self.get_file("README.md") as f:
                txt = f.read()
        except OSError as exc:
            raise ParseFailed(f"Could not read README.md: {exc}") from exc

        # Front-matter is the text between the first two "---" lines.
        if txt.count("---\n") < 2:
            raise ParseFailed("README.md has no YAML front-matter")
        try:
            meta = yaml.safe_load(StringIO(txt.split("---\n")[1]))
        except (yaml.YAMLError, ValueError) as exc:
            # ValueError: PyYAML's scalar constructors (e.g. for an
            # out-of-range date) can raise it instead of YAMLError.
            raise ParseFailed(f"Invalid YAML front-matter: {exc}") from exc
        if not isinstance(meta, dict):
            raise ParseFailed("YAML front-matter did not parse to a mapping")

        if {"library_name", "base_model", "new_version"}.intersection(meta):
            raise ParseFailed("README.md is a model card")

        if "license" in meta:
            self._contents["license"] = License(
                proj=self.proj,
                shortname=meta["license"],
                fullname=meta.get("license_name", "unknown"),
                url=meta.get("license_link", ""),
            )

        for arxiv_id in self._arxiv_ids(meta):
            self._contents.setdefault("citations", []).append(
                Citation(proj=self.proj, meta=dict(arxiv=arxiv_id))
            )

        # TODO: source_datasets are links to other datasets
        descriptive_keys = (
            "pretty_name",
            "language",
            "tags",
            "task_categories",
            "task_ids",
            "size_categories",
            "source_datasets",
            "annotations_creators",
            "language_creators",
            "paperswithcode_id",
        )
        self._contents["descriptive_metadata"] = DescriptiveMetadata(
            proj=self.proj,
            meta={k: meta[k] for k in descriptive_keys if k in meta},
        )

        info = meta.get("dataset_info")
        if isinstance(info, list):
            # Only including configured tabular data for now: entries need
            # both a config name and a feature schema.
            tables = [
                TabularData(
                    name=entry["config_name"],
                    proj=self.proj,
                    schema=entry["features"],
                    metadata=self._size_metadata(entry),
                )
                for entry in info
                if isinstance(entry, dict)
                and "config_name" in entry
                and "features" in entry
            ]
            if tables:
                self._contents["tabular_data"] = tables
        elif isinstance(info, dict) and "features" in info:
            # Single unnamed config: stored as one object, not a list.
            self._contents["tabular_data"] = TabularData(
                name="data",
                proj=self.proj,
                schema=info["features"],
                metadata=self._size_metadata(info),
            )

    @staticmethod
    def _arxiv_ids(meta: dict) -> list[str]:
        """Return the ids of all ``arxiv:<id>`` entries under ``meta["tags"]``."""
        # An empty "tags:" key loads as None; a scalar is treated as one tag.
        tags = meta.get("tags") or []
        if not isinstance(tags, list):
            tags = [tags]
        return [
            tag.removeprefix("arxiv:")
            for tag in tags
            if isinstance(tag, str) and tag.startswith("arxiv:")
        ]

    @staticmethod
    def _size_metadata(info: dict) -> dict:
        """Pick the split/size entries out of one ``dataset_info`` mapping."""
        return {
            k: info[k]
            for k in ("splits", "download_size", "dataset_size")
            if k in info
        }

    @staticmethod
    def _create(path: str) -> None:
        """Scaffold a minimal but valid HuggingFace dataset card."""
        with open(f"{path}/README.md", "w", encoding="utf-8") as f:
            f.write(
                """\
---
pretty_name: My Dataset
license: apache-2.0
language:
- en
tags:
- text
task_categories:
- text-classification
size_categories:
- n<1K
---

# My Dataset

A short description of the dataset.
"""
            )