This file is a merged representation of a subset of the codebase, containing files not matching ignore patterns, combined into a single document by Repomix.

<file_summary>
This section contains a summary of this file.

<purpose>
This file contains a packed representation of the entire repository's contents.
It is designed to be easily consumable by AI systems for analysis, code review,
or other automated processes.
</purpose>

<file_format>
The content is organized as follows:
1. This summary section
2. Repository information
3. Directory structure
4. Repository files (if enabled)
4. Repository files, each consisting of:
  - File path as an attribute
  - Full contents of the file
</file_format>

<usage_guidelines>
- This file should be treated as read-only. Any changes should be made to the
  original repository files, not this packed version.
- When processing this file, use the file path to distinguish
  between different files in the repository.
- Be aware that this file may contain sensitive information. Handle it with
  the same level of security as you would the original repository.
</usage_guidelines>

<notes>
- Some files may have been excluded based on .gitignore rules and Repomix's configuration
- Binary files are not included in this packed representation. Please refer to the Repository Structure section for a complete list of file paths, including binary files
- Files matching these patterns are excluded: varia, .specstory
- Files matching patterns in .gitignore are excluded
- Files matching default ignore patterns are excluded
- Files are sorted by Git change count (files with more changes are at the bottom)
</notes>

<additional_info>

</additional_info>

</file_summary>

<directory_structure>
.claude/
  settings.local.json
.cursor/
  rules/
    alignment-algorithms.mdc
    frame-matching-models.mdc
    video-composition-rules.mdc
    video-processing-flow.mdc
.giga/
  specifications.json
.github/
  workflows/
    push.yml
    release.yml
src/
  vidkompy/
    core/
      __init__.py
      alignment_engine.py
      dtw_aligner.py
      frame_fingerprint.py
      multi_resolution_aligner.py
      numba_optimizations.py
      precise_temporal_alignment.py
      spatial_alignment.py
      temporal_alignment.py
      tunnel_aligner.py
      video_processor.py
    __init__.py
    __main__.py
    __version__.py
    models.py
    vidkompy.py
.cursorrules
.gitignore
.pre-commit-config.yaml
AGENT.md
benchmark.sh
CHANGELOG.md
CLAUDE.md
LICENSE
package.toml
PROGRESS.md
pyproject.toml
README.md
SPEC.md
TODO.md
</directory_structure>

<files>
This section contains the contents of the repository's files.

<file path=".giga/specifications.json">
[
  {
    "fileName": "main-overview.mdc",
    "description": "Complete system overview covering the video overlay architecture, key components, and processing pipeline flow"
  },
  {
    "fileName": "alignment-algorithms.mdc",
    "description": "Detailed documentation of the spatial and temporal alignment algorithms, including template matching, feature matching, and frame/audio synchronization techniques"
  },
  {
    "fileName": "video-processing-flow.mdc",
    "description": "End-to-end data flow documentation covering video ingestion, frame processing, alignment calculations, overlay composition, and output generation"
  },
  {
    "fileName": "frame-matching-models.mdc",
    "description": "Technical specifications for frame matching and mapping models, including similarity scoring, keyframe selection, and frame rate handling logic"
  },
  {
    "fileName": "video-composition-rules.mdc",
    "description": "Documentation of the business rules and constraints for video composition, including foreground preservation, audio handling, and quality thresholds"
  }
]
</file>

<file path=".github/workflows/push.yml">
name: Build & Test

on:
  push:
    branches: [main]
    tags-ignore: ["v*"]
  pull_request:
    branches: [main]
  workflow_dispatch:

permissions:
  contents: write
  id-token: write

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  quality:
    name: Code Quality
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Run Ruff lint
        uses: astral-sh/ruff-action@v3
        with:
          version: "latest"
          args: "check --output-format=github"

      - name: Run Ruff Format
        uses: astral-sh/ruff-action@v3
        with:
          version: "latest"
          args: "format --check --respect-gitignore"

  test:
    name: Run Tests
    needs: quality
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12"]
        os: [ubuntu-latest]
      fail-fast: true
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install UV
        uses: astral-sh/setup-uv@v5
        with:
          version: "latest"
          python-version: ${{ matrix.python-version }}
          enable-cache: true
          cache-suffix: ${{ matrix.os }}-${{ matrix.python-version }}

      - name: Install test dependencies
        run: |
          uv pip install --system --upgrade pip
          uv pip install --system ".[test]"

      - name: Run tests with Pytest
        run: uv run pytest -n auto --maxfail=1 --disable-warnings --cov-report=xml --cov-config=pyproject.toml --cov=src/vidkompo --cov=tests tests/

      - name: Upload coverage report
        uses: actions/upload-artifact@v4
        with:
          name: coverage-${{ matrix.python-version }}-${{ matrix.os }}
          path: coverage.xml

  build:
    name: Build Distribution
    needs: test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install UV
        uses: astral-sh/setup-uv@v5
        with:
          version: "latest"
          python-version: "3.12"
          enable-cache: true

      - name: Install build tools
        run: uv pip install build hatchling hatch-vcs

      - name: Build distributions
        run: uv run python -m build --outdir dist

      - name: Upload distribution artifacts
        uses: actions/upload-artifact@v4
        with:
          name: dist-files
          path: dist/
          retention-days: 5
</file>

<file path=".github/workflows/release.yml">
name: Release

on:
  push:
    tags: ["v*"]

permissions:
  contents: write
  id-token: write

jobs:
  release:
    name: Release to PyPI
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/vidkompo
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install UV
        uses: astral-sh/setup-uv@v5
        with:
          version: "latest"
          python-version: "3.12"
          enable-cache: true

      - name: Install build tools
        run: uv pip install build hatchling hatch-vcs

      - name: Build distributions
        run: uv run python -m build --outdir dist

      - name: Verify distribution files
        run: |
          ls -la dist/
          test -n "$(find dist -name '*.whl')" || (echo "Wheel file missing" && exit 1)
          test -n "$(find dist -name '*.tar.gz')" || (echo "Source distribution missing" && exit 1)

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_TOKEN }}

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v1
        with:
          files: dist/*
          generate_release_notes: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
</file>

<file path="src/vidkompy/core/numba_optimizations.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/numba_optimizations.py

"""
Numba-optimized functions for vidkompy performance bottlenecks.

These JIT-compiled functions provide significant speedups for 
computationally intensive operations like DTW and fingerprint comparisons.
"""

import numpy as np
from numba import jit, prange
from loguru import logger


# ==============================================================================
# DTW Algorithm Optimizations (5-20x speedup)
# ==============================================================================

@jit(nopython=True, parallel=True, cache=True)
def compute_dtw_cost_matrix_numba(
    fg_features: np.ndarray, 
    bg_features: np.ndarray, 
    window: int
) -> np.ndarray:
    """
    Optimized DTW cost matrix computation using Numba.
    
    This replaces the nested loops in DTWAligner._build_dtw_matrix()
    with a parallelized JIT-compiled version.
    
    Args:
        fg_features: Foreground feature vectors (N, D)
        bg_features: Background feature vectors (M, D)  
        window: Sakoe-Chiba band width
        
    Returns:
        DTW cost matrix (N+1, M+1)
    """
    n_fg, n_bg = fg_features.shape[0], bg_features.shape[0]
    
    # Initialize DTW matrix
    dtw = np.full((n_fg + 1, n_bg + 1), np.inf, dtype=np.float64)
    dtw[0, 0] = 0.0
    
    # Compute pairwise distances first (can be parallelized)
    distances = np.zeros((n_fg, n_bg), dtype=np.float64)
    for i in prange(n_fg):
        for j in range(n_bg):
            # Euclidean distance between feature vectors
            dist = 0.0
            for k in range(fg_features.shape[1]):
                diff = fg_features[i, k] - bg_features[j, k]
                dist += diff * diff
            distances[i, j] = np.sqrt(dist)
    
    # Normalize distances to [0, 1]
    max_dist = np.max(distances)
    if max_dist > 0:
        distances = distances / max_dist
    
    # Fill DTW matrix with Sakoe-Chiba band constraint
    for i in range(1, n_fg + 1):
        j_start = max(1, i - window)
        j_end = min(n_bg + 1, i + window)
        
        for j in range(j_start, j_end):
            cost = distances[i - 1, j - 1]
            
            # DTW recursion
            dtw[i, j] = cost + min(
                dtw[i - 1, j],      # Insertion
                dtw[i, j - 1],      # Deletion  
                dtw[i - 1, j - 1]   # Match
            )
    
    return dtw


@jit(nopython=True, cache=True)
def find_dtw_path_numba(dtw_matrix: np.ndarray) -> np.ndarray:
    """
    Optimized DTW path finding using Numba.
    
    This replaces the backtracking loop in DTWAligner._find_optimal_path()
    with a JIT-compiled version.
    
    Args:
        dtw_matrix: Computed DTW cost matrix
        
    Returns:
        Optimal path as array of (i, j) pairs
    """
    n_fg, n_bg = dtw_matrix.shape
    n_fg -= 1
    n_bg -= 1
    
    # Use a pre-allocated array for path (worst case size)
    max_path_length = n_fg + n_bg
    path_array = np.zeros((max_path_length, 2), dtype=np.int32)
    path_idx = 0
    
    # Backtrack to find path
    i, j = n_fg, n_bg
    
    while i > 0 and j > 0:
        path_array[path_idx, 0] = i - 1  # Convert to 0-based indices
        path_array[path_idx, 1] = j - 1
        path_idx += 1
        
        # Choose direction with minimum cost
        if i == 1:
            j -= 1
        elif j == 1:
            i -= 1
        else:
            costs = np.array([
                dtw_matrix[i - 1, j],      # From above
                dtw_matrix[i, j - 1],      # From left
                dtw_matrix[i - 1, j - 1]   # From diagonal
            ])
            
            min_idx = np.argmin(costs)
            if min_idx == 0:
                i -= 1
            elif min_idx == 1:
                j -= 1
            else:
                i -= 1
                j -= 1
    
    # Add remaining path
    while i > 0:
        i -= 1
        path_array[path_idx, 0] = i
        path_array[path_idx, 1] = 0
        path_idx += 1
    while j > 0:
        j -= 1
        path_array[path_idx, 0] = 0
        path_array[path_idx, 1] = j
        path_idx += 1
    
    # Trim and reverse
    path_array = path_array[:path_idx]
    return path_array[::-1]


# ==============================================================================
# Fingerprint Similarity Optimizations (3-10x speedup)
# ==============================================================================

@jit(nopython=True, parallel=True, cache=True)
def compute_hamming_distances_batch(
    hashes1: np.ndarray, 
    hashes2: np.ndarray
) -> np.ndarray:
    """
    Batch computation of Hamming distances between hash arrays.
    
    This accelerates fingerprint comparisons when comparing many
    frames at once.
    
    Args:
        hashes1: First set of hashes (N, hash_size)
        hashes2: Second set of hashes (M, hash_size)
        
    Returns:
        Distance matrix (N, M)
    """
    n1, n2 = hashes1.shape[0], hashes2.shape[0]
    distances = np.zeros((n1, n2), dtype=np.float64)
    
    for i in prange(n1):
        for j in range(n2):
            # Hamming distance
            distance = 0.0
            for k in range(hashes1.shape[1]):
                if hashes1[i, k] != hashes2[j, k]:
                    distance += 1.0
            distances[i, j] = distance
    
    return distances


@jit(nopython=True, cache=True)
def compute_histogram_correlation(hist1: np.ndarray, hist2: np.ndarray) -> float:
    """
    Optimized histogram correlation computation.
    
    Replaces cv2.compareHist for faster execution when comparing
    many histograms.
    
    Args:
        hist1: First histogram
        hist2: Second histogram
        
    Returns:
        Correlation coefficient
    """
    # Normalize histograms
    sum1 = np.sum(hist1)
    sum2 = np.sum(hist2)
    
    if sum1 == 0 or sum2 == 0:
        return 0.0
    
    norm1 = hist1 / sum1
    norm2 = hist2 / sum2
    
    # Compute correlation
    mean1 = np.mean(norm1)
    mean2 = np.mean(norm2)
    
    numerator = 0.0
    denom1 = 0.0
    denom2 = 0.0
    
    for i in range(len(norm1)):
        diff1 = norm1[i] - mean1
        diff2 = norm2[i] - mean2
        numerator += diff1 * diff2
        denom1 += diff1 * diff1
        denom2 += diff2 * diff2
    
    denominator = np.sqrt(denom1 * denom2)
    if denominator == 0:
        return 0.0
    
    return numerator / denominator


@jit(nopython=True, cache=True)
def compute_weighted_similarity(
    hash_distances: np.ndarray,
    hist_correlation: float,
    weights: np.ndarray
) -> float:
    """
    Compute weighted similarity score from multiple hash distances.
    
    Args:
        hash_distances: Array of normalized hash distances
        hist_correlation: Histogram correlation score
        weights: Weight for each hash type
        
    Returns:
        Combined similarity score (0-1)
    """
    # Convert distances to similarities
    hash_similarities = 1.0 - hash_distances
    
    # Weighted sum
    weighted_sum = 0.0
    weight_sum = 0.0
    
    for i in range(len(hash_similarities)):
        weighted_sum += hash_similarities[i] * weights[i]
        weight_sum += weights[i]
    
    # Add histogram correlation with its weight
    if len(weights) > len(hash_similarities):
        weighted_sum += hist_correlation * weights[-1]
        weight_sum += weights[-1]
    
    if weight_sum == 0:
        return 0.0
    
    return weighted_sum / weight_sum


# ==============================================================================
# Helper Functions for Integration
# ==============================================================================

def prepare_fingerprints_for_numba(
    fingerprints: dict[int, dict[str, np.ndarray]]
) -> tuple[np.ndarray, dict[str, np.ndarray]]:
    """
    Convert fingerprint dictionary to numpy arrays for Numba processing.
    
    Args:
        fingerprints: Dictionary of frame fingerprints
        
    Returns:
        Tuple of (frame_indices, feature_matrix)
    """
    # Extract frame indices
    indices = np.array(sorted(fingerprints.keys()), dtype=np.int32)
    
    # Collect all hash types
    sample_fp = next(iter(fingerprints.values()))
    hash_types = [k for k in sample_fp.keys() if k != "histogram"]
    
    # Build feature matrix
    features = {}
    for hash_type in hash_types:
        hash_list = []
        for idx in indices:
            hash_data = fingerprints[idx][hash_type]
            hash_list.append(hash_data.flatten())
        features[hash_type] = np.array(hash_list)
    
    # Add histograms separately
    if "histogram" in sample_fp:
        hist_list = []
        for idx in indices:
            hist_list.append(fingerprints[idx]["histogram"])
        features["histogram"] = np.array(hist_list)
    
    return indices, features


def log_numba_compilation():
    """Log information about Numba JIT compilation."""
    logger.info("Numba JIT compilation in progress...")
    logger.info("First run will be slower due to compilation")
    logger.info("Subsequent runs will be significantly faster")
</file>

<file path="src/vidkompy/core/tunnel_aligner.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/tunnel_aligner.py
"""Tunnel-based temporal alignment using direct frame comparison with sliding windows."""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING

import cv2
import numpy as np
from loguru import logger
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn

from vidkompy.models import FrameAlignment

console = Console()


@dataclass
class TunnelConfig:
    """Configuration for tunnel-based alignment."""
    window_size: int = 30
    downsample_factor: float = 1.0
    early_stop_threshold: float = 0.05
    merge_strategy: str = "average"  # "average" or "confidence_weighted"
    mask_threshold: int = 10  # For tunnel_mask
    mask_erosion: int = 2  # For tunnel_mask


class TunnelAligner(ABC):
    """Base class for tunnel-based temporal alignment."""
    
    def __init__(self, config: TunnelConfig | None = None):
        """Initialize tunnel aligner with configuration."""
        self.config = config or TunnelConfig()
        self._frame_cache: dict[tuple[str, int], np.ndarray] = {}
        self._cache_hits = 0
        self._cache_misses = 0
        
    def align(
        self,
        fg_frames: np.ndarray,
        bg_frames: np.ndarray,
        x_offset: int,
        y_offset: int,
        verbose: bool = False
    ) -> tuple[list[FrameAlignment], float]:
        """Perform tunnel-based temporal alignment.
        
        Args:
            fg_frames: Foreground frames array
            bg_frames: Background frames array
            x_offset: X offset for spatial alignment
            y_offset: Y offset for spatial alignment
            verbose: Enable verbose logging
            
        Returns:
            Tuple of (frame alignments, confidence score)
        """
        if verbose:
            logger.info(f"Starting tunnel alignment with {len(fg_frames)} FG and {len(bg_frames)} BG frames")
            logger.info(f"Config: window_size={self.config.window_size}, downsample={self.config.downsample_factor}")
        
        # Downsample frames if requested
        if self.config.downsample_factor < 1.0:
            fg_frames = self._downsample_frames(fg_frames)
            bg_frames = self._downsample_frames(bg_frames)
            if verbose:
                logger.info(f"Downsampled to {len(fg_frames)} FG and {len(bg_frames)} BG frames")
        
        # Perform forward pass
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
            transient=True,
        ) as progress:
            task = progress.add_task("Forward pass...", total=len(fg_frames))
            forward_mapping = self._forward_pass(
                fg_frames, bg_frames, x_offset, y_offset, progress, task, verbose
            )
            progress.update(task, completed=len(fg_frames))
        
        # Perform backward pass
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
            transient=True,
        ) as progress:
            task = progress.add_task("Backward pass...", total=len(fg_frames))
            backward_mapping = self._backward_pass(
                fg_frames, bg_frames, x_offset, y_offset, progress, task, verbose
            )
            progress.update(task, completed=len(fg_frames))
        
        # Merge mappings
        final_mapping, confidence = self._merge_mappings(
            forward_mapping, backward_mapping, len(fg_frames), verbose
        )
        
        if verbose:
            logger.info(f"Cache stats: {self._cache_hits} hits, {self._cache_misses} misses")
            logger.info(f"Final confidence: {confidence:.3f}")
        
        # Convert to FrameAlignment objects
        alignments = []
        for fg_idx, bg_idx in enumerate(final_mapping):
            alignments.append(FrameAlignment(
                fg_frame_idx=fg_idx,
                bg_frame_idx=int(bg_idx),
                similarity_score=1.0  # Individual confidence could be computed
            ))
        
        return alignments, confidence
    
    def _downsample_frames(self, frames: np.ndarray) -> np.ndarray:
        """Downsample frames by the configured factor."""
        if self.config.downsample_factor >= 1.0:
            return frames
        
        downsampled = []
        for frame in frames:
            h, w = frame.shape[:2]
            new_h = int(h * self.config.downsample_factor)
            new_w = int(w * self.config.downsample_factor)
            downsampled.append(cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA))
        
        return np.array(downsampled)
    
    def _forward_pass(
        self,
        fg_frames: np.ndarray,
        bg_frames: np.ndarray,
        x_offset: int,
        y_offset: int,
        progress: Progress,
        task: int,
        verbose: bool
    ) -> list[int]:
        """Perform forward pass from start to end."""
        mapping = []
        last_bg_idx = 0
        
        for fg_idx in range(len(fg_frames)):
            # Define search window
            window_start = last_bg_idx
            window_end = min(last_bg_idx + self.config.window_size, len(bg_frames))
            
            # Find best match in window
            best_bg_idx, best_diff = self._find_best_match(
                fg_frames[fg_idx],
                bg_frames[window_start:window_end],
                x_offset,
                y_offset,
                window_start
            )
            
            mapping.append(best_bg_idx)
            last_bg_idx = best_bg_idx
            
            # Early stopping if perfect match found
            if best_diff < self.config.early_stop_threshold:
                last_bg_idx = max(last_bg_idx, best_bg_idx + 1)
            
            progress.update(task, advance=1)
            
            if verbose and fg_idx % 100 == 0:
                logger.debug(f"Forward: FG {fg_idx} -> BG {best_bg_idx} (diff={best_diff:.3f})")
        
        return mapping
    
    def _backward_pass(
        self,
        fg_frames: np.ndarray,
        bg_frames: np.ndarray,
        x_offset: int,
        y_offset: int,
        progress: Progress,
        task: int,
        verbose: bool
    ) -> list[int]:
        """Perform backward pass from end to start."""
        mapping = [0] * len(fg_frames)
        last_bg_idx = len(bg_frames) - 1
        
        for fg_idx in range(len(fg_frames) - 1, -1, -1):
            # Define search window
            window_end = last_bg_idx + 1
            window_start = max(0, last_bg_idx - self.config.window_size + 1)
            
            # Find best match in window
            best_bg_idx, best_diff = self._find_best_match(
                fg_frames[fg_idx],
                bg_frames[window_start:window_end],
                x_offset,
                y_offset,
                window_start
            )
            
            mapping[fg_idx] = best_bg_idx
            last_bg_idx = best_bg_idx
            
            # Early stopping if perfect match found
            if best_diff < self.config.early_stop_threshold:
                last_bg_idx = min(last_bg_idx, best_bg_idx - 1)
            
            progress.update(task, advance=1)
            
            if verbose and fg_idx % 100 == 0:
                logger.debug(f"Backward: FG {fg_idx} -> BG {best_bg_idx} (diff={best_diff:.3f})")
        
        return mapping
    
    def _find_best_match(
        self,
        fg_frame: np.ndarray,
        bg_window: np.ndarray,
        x_offset: int,
        y_offset: int,
        window_offset: int
    ) -> tuple[int, float]:
        """Find best matching BG frame in window for given FG frame."""
        best_idx = 0
        best_diff = float('inf')
        
        for i, bg_frame in enumerate(bg_window):
            diff = self.compute_frame_difference(fg_frame, bg_frame, x_offset, y_offset)
            
            if diff < best_diff:
                best_diff = diff
                best_idx = i
                
                # Early exit if very good match found
                if diff < self.config.early_stop_threshold:
                    break
        
        return window_offset + best_idx, best_diff
    
    def _merge_mappings(
        self,
        forward: list[int],
        backward: list[int],
        num_frames: int,
        verbose: bool
    ) -> tuple[list[int], float]:
        """Merge forward and backward mappings."""
        merged = []
        total_diff = 0.0
        
        if self.config.merge_strategy == "average":
            # Simple average
            for i in range(num_frames):
                merged_idx = (forward[i] + backward[i]) / 2.0
                merged.append(merged_idx)
                total_diff += abs(forward[i] - backward[i])
        
        elif self.config.merge_strategy == "confidence_weighted":
            # Weight by consistency between forward/backward
            for i in range(num_frames):
                diff = abs(forward[i] - backward[i])
                if diff < 5:  # High consistency
                    weight = 0.5
                else:  # Lower consistency, trust forward more
                    weight = 0.7
                merged_idx = weight * forward[i] + (1 - weight) * backward[i]
                merged.append(merged_idx)
                total_diff += diff
        
        # Ensure monotonicity
        for i in range(1, len(merged)):
            if merged[i] <= merged[i-1]:
                merged[i] = merged[i-1] + 0.1
        
        # Compute confidence based on forward/backward consistency
        avg_diff = total_diff / num_frames if num_frames > 0 else 0
        confidence = max(0.0, 1.0 - (avg_diff / 10.0))  # Normalize to 0-1
        
        if verbose:
            logger.info(f"Merge: avg difference = {avg_diff:.2f}, confidence = {confidence:.3f}")
        
        return merged, confidence
    
    @abstractmethod
    def compute_frame_difference(
        self,
        fg_frame: np.ndarray,
        bg_frame: np.ndarray,
        x_offset: int,
        y_offset: int
    ) -> float:
        """Compute difference between FG and BG frames.
        
        To be implemented by subclasses.
        """
        pass


class TunnelFullAligner(TunnelAligner):
    """Tunnel aligner using full frame comparison."""
    
    def compute_frame_difference(
        self,
        fg_frame: np.ndarray,
        bg_frame: np.ndarray,
        x_offset: int,
        y_offset: int
    ) -> float:
        """Compute pixel-wise difference between frames."""
        fg_h, fg_w = fg_frame.shape[:2]
        
        # Crop BG frame to FG region
        bg_cropped = bg_frame[y_offset:y_offset+fg_h, x_offset:x_offset+fg_w]
        
        # Handle size mismatch
        if bg_cropped.shape[:2] != fg_frame.shape[:2]:
            return float('inf')
        
        # Compute pixel-wise difference
        diff = np.abs(fg_frame.astype(float) - bg_cropped.astype(float))
        
        # Return mean absolute difference
        return np.mean(diff)


class TunnelMaskAligner(TunnelAligner):
    """Tunnel aligner using masked frame comparison."""
    
    def __init__(self, config: TunnelConfig | None = None):
        """Initialize with mask generation."""
        super().__init__(config)
        self._mask_cache: dict[tuple[int, int], np.ndarray] = {}
    
    def align(
        self,
        fg_frames: np.ndarray,
        bg_frames: np.ndarray,
        x_offset: int,
        y_offset: int,
        verbose: bool = False
    ) -> tuple[list[FrameAlignment], float]:
        """Perform alignment with mask generation."""
        # Generate mask from first few FG frames
        self._generate_mask(fg_frames[:10])
        
        if verbose:
            mask_coverage = np.sum(self._mask) / self._mask.size
            logger.info(f"Generated mask with {mask_coverage:.1%} coverage")
        
        # Perform standard alignment
        return super().align(fg_frames, bg_frames, x_offset, y_offset, verbose)
    
    def _generate_mask(self, sample_frames: np.ndarray):
        """Generate content mask from sample frames."""
        if len(sample_frames) == 0:
            return
        
        h, w = sample_frames[0].shape[:2]
        
        # Accumulate non-black pixels across samples
        accumulator = np.zeros((h, w), dtype=np.float32)
        
        for frame in sample_frames:
            # Convert to grayscale if needed
            if len(frame.shape) == 3:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            else:
                gray = frame
            
            # Add pixels above threshold
            accumulator += (gray > self.config.mask_threshold).astype(float)
        
        # Create mask where majority of samples had content
        mask = (accumulator > len(sample_frames) / 2).astype(np.uint8)
        
        # Erode to avoid edge artifacts
        if self.config.mask_erosion > 0:
            kernel = np.ones((self.config.mask_erosion, self.config.mask_erosion), np.uint8)
            mask = cv2.erode(mask, kernel, iterations=1)
        
        self._mask = mask
    
    def compute_frame_difference(
        self,
        fg_frame: np.ndarray,
        bg_frame: np.ndarray,
        x_offset: int,
        y_offset: int
    ) -> float:
        """Compute masked pixel difference between frames."""
        fg_h, fg_w = fg_frame.shape[:2]
        
        # Crop BG frame to FG region
        bg_cropped = bg_frame[y_offset:y_offset+fg_h, x_offset:x_offset+fg_w]
        
        # Handle size mismatch
        if bg_cropped.shape[:2] != fg_frame.shape[:2]:
            return float('inf')
        
        # Apply mask to both frames
        if hasattr(self, '_mask') and self._mask is not None:
            # Resize mask to match current frame size if needed
            if self._mask.shape[:2] != (fg_h, fg_w):
                mask_resized = cv2.resize(self._mask, (fg_w, fg_h), interpolation=cv2.INTER_NEAREST)
            else:
                mask_resized = self._mask
            
            fg_masked = fg_frame * mask_resized[..., np.newaxis]
            bg_masked = bg_cropped * mask_resized[..., np.newaxis]
            mask_sum = np.sum(mask_resized)
        else:
            # Fallback to full frame if no mask
            fg_masked = fg_frame
            bg_masked = bg_cropped
            mask_sum = fg_h * fg_w
        
        if mask_sum == 0:
            return float('inf')
        
        # Compute pixel-wise difference in masked region
        diff = np.abs(fg_masked.astype(float) - bg_masked.astype(float))
        
        # Return mean absolute difference normalized by mask area
        return np.sum(diff) / (mask_sum * fg_frame.shape[2] if len(fg_frame.shape) == 3 else mask_sum)
</file>

<file path=".pre-commit-config.yaml">
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.3.4
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format
        args: [--respect-gitignore]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: check-yaml
      - id: check-toml
      - id: check-added-large-files
      - id: debug-statements
      - id: check-case-conflict
      - id: mixed-line-ending
        args: [--fix=lf]
</file>

<file path="LICENSE">
MIT License

Copyright (c) 2025 Adam Twardoch

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
</file>

<file path="package.toml">
# Package configuration
[package]
include_cli = true        # Include CLI boilerplate
include_logging = true    # Include logging setup
use_pydantic = true      # Use Pydantic for data validation
use_rich = true          # Use Rich for terminal output

[features]
mkdocs = false           # Enable MkDocs documentation
vcs = true              # Initialize Git repository
github_actions = true   # Add GitHub Actions workflows
</file>

<file path="src/vidkompy/core/__init__.py">
# this_file: src/vidkompy/core/__init__.py
</file>

<file path="src/vidkompy/__init__.py">
# this_file: src/vidkompy/__init__.py

from .__version__ import __version__

__all__ = ["__version__"]
</file>

<file path="AGENT.md">
## Coding style

<guidelines>
# When you write code

- Iterate gradually, avoiding major changes
- Minimize confirmations and checks
- Preserve existing code/structure unless necessary
- Use constants over magic numbers
- Check for existing solutions in the codebase before starting
- Check often the coherence of the code you’re writing with the rest of the code.
- Focus on minimal viable increments and ship early
- Write explanatory docstrings/comments that explain what and WHY this does, explain where and how the code is used/referred to elsewhere in the code
- Analyze code line-by-line
- Handle failures gracefully with retries, fallbacks, user guidance
- Address edge cases, validate assumptions, catch errors early
- Let the computer do the work, minimize user decisions
- Reduce cognitive load, beautify code
- Modularize repeated logic into concise, single-purpose functions
- Favor flat over nested structures
- Consistently keep, document, update and consult the holistic overview mental image of the codebase. 

Work in rounds: 

- Create `PROGRESS.md` as a detailed flat plan with `[ ]` items. 
- Identify the most important TODO items, and create `TODO.md` with `[ ]` items. 
- Implement the changes. 
- Update `PROGRESS.md` and `TODO.md` as you go. 
- After each round of changes, update `CHANGELOG.md` with the changes.
- Update `README.md` to reflect the changes.

## Keep track of paths

In each source file, maintain the up-to-date `this_file` record that shows the path of the current file relative to project root. Place the `this_file` record near the top of the file, as a comment after the shebangs, or in the YAML Markdown frontmatter.

## When you write Python

- Use `uv pip`, never `pip`
- Use `python -m` when running code
- PEP 8: Use consistent formatting and naming
- Write clear, descriptive names for functions and variables
- PEP 20: Keep code simple and explicit. Prioritize readability over cleverness
- Use type hints in their simplest form (list, dict, | for unions)
- PEP 257: Write clear, imperative docstrings
- Use f-strings. Use structural pattern matching where appropriate
- ALWAYS add "verbose" mode logugu-based logging, & debug-log
- For CLI Python scripts, use fire & rich, and start the script with

```
#!/usr/bin/env -S uv run -s
# /// script
# dependencies = ["PKG1", "PKG2"]
# ///
# this_file: PATH_TO_CURRENT_FILE
```

Ask before extending/refactoring existing code in a way that may add complexity or break things.

When you’re finished, print "Wait, but" to go back, think & reflect, revise & improvement what you’ve done (but don’t invent functionality freely). Repeat this. But stick to the goal of "minimal viable next version". Lead two experts: "Ideot" for creative, unorthodox ideas, and "Critin" to critique flawed thinking and moderate for balanced discussions. The three of you shall illuminate knowledge with concise, beautiful responses, process methodically for clear answers, collaborate step-by-step, sharing thoughts and adapting. If errors are found, step back and focus on accuracy and progress.

## After Python changes run:

```
fd -e py -x autoflake {}; fd -e py -x pyupgrade --py312-plus {}; fd -e py -x ruff check --output-format=github --fix --unsafe-fixes {}; fd -e py -x ruff format --respect-gitignore --target-version py312 {}; python -m pytest;
```

Be creative, diligent, critical, relentless & funny!
</guidelines>


# `vidkompy`

**Intelligent Video Overlay and Synchronization**

`vidkompy` is a powerful command-line tool engineered to overlay a foreground video onto a background video with exceptional precision and automatic alignment. The system intelligently handles discrepancies in resolution, frame rate, duration, and audio, prioritizing content integrity and synchronization accuracy over raw processing speed.

The core philosophy of `vidkompy` is to treat the **foreground video as the definitive source of quality and timing**. All its frames are preserved without modification or re-timing. The background video is dynamically adapted—stretched, retimed, and selectively sampled—to synchronize perfectly with every frame of the foreground content, ensuring a seamless and coherent final output.

---

## Features

- **Automatic Spatial Alignment**: Intelligently detects the optimal x/y offset to position the foreground video within the background, even if they are cropped differently.
- **Advanced Temporal Synchronization**: Aligns videos with different start times, durations, and frame rates, eliminating temporal drift and ensuring content matches perfectly over time.
- **Foreground-First Principle**: Guarantees that every frame of the foreground video is included in the output, preserving its original timing and quality. The background video is adapted to match the foreground.
- **Drift-Free Alignment**: Utilizes Dynamic Time Warping (DTW) to create a globally optimal, monotonic alignment, preventing the common "drift-and-catchup" artifacts seen with simpler methods.
- **High-Performance Processing**: Leverages multi-core processing, perceptual hashing, and optimized video I/O to deliver results quickly.
- Frame fingerprinting is 100-1000x faster than traditional pixel-wise comparison.
- Sequential video composition is 10-100x faster than random-access methods.
- **Smart Audio Handling**: Automatically uses the foreground audio track if available, falling back to the background audio. The audio is correctly synchronized with the final video.
- **Flexible Operation Modes**: Supports specialized modes like `border` matching for aligning content based on visible background edges, and `smooth` blending for seamless visual integration.

## How It Works

The `vidkompy` pipeline is a multi-stage process designed for precision and accuracy:

1.  **Video Analysis**: The tool begins by probing both background (BG) and foreground (FG) videos using `ffprobe` to extract essential metadata: resolution, frames per second (FPS), duration, frame count, and audio stream information.

2.  **Spatial Alignment**: To determine _where_ to place the foreground on the background, `vidkompy` extracts a sample frame from the middle of each video (where content is most likely to be stable). It then calculates the optimal (x, y) offset.

3.  **Temporal Alignment**: This is the core of `vidkompy`. To determine _when_ to start the overlay and how to map frames over time, the tool generates "fingerprints" of frames from both videos and uses Dynamic Time Warping (DTW) to find the best alignment path. This ensures every foreground frame is matched to the most suitable background frame.

4.  **Video Composition**: Once the spatial and temporal alignments are known, `vidkompy` composes the final video. It reads both video streams sequentially (for maximum performance) and, for each foreground frame, fetches the corresponding background frame as determined by the alignment map. The foreground is then overlaid at the correct spatial position.

5.  **Audio Integration**: After the silent video is composed, `vidkompy` adds the appropriate audio track (preferring the foreground's audio) with the correct offset to ensure it's perfectly synchronized with the video content.

## The Algorithms

`vidkompy` employs several sophisticated algorithms to achieve its high-precision results.

### Frame Fingerprinting (Perceptual Hashing)

**TLDR:** Instead of comparing the millions of pixels in a frame, `vidkompy` creates a tiny, unique "fingerprint" (a hash) for each frame. Comparing these small fingerprints is thousands of times faster and smart enough to ignore minor changes from video compression.

---

The `FrameFingerprinter` module is designed for ultra-fast and robust frame comparison. It uses perceptual hashing, which generates a compact representation of a frame's visual structure.

The process works as follows:

1.  **Standardization**: The input frame is resized to a small, standard size (e.g., 64x64 pixels) and converted to grayscale. This ensures consistency and focuses on structural information over color.
2.  **Multi-Algorithm Hashing**: To improve robustness, `vidkompy` computes several types of perceptual hashes for each frame, as different algorithms are sensitive to different visual features:
- `pHash` (Perceptual Hash): Analyzes the frequency domain (using DCT), making it robust to changes in brightness, contrast, and gamma correction.
- `AverageHash`: Computes a hash based on the average color of the frame.
- `ColorMomentHash`: Captures the color distribution statistics of the frame.
- `MarrHildrethHash`: Detects edges and shapes, making it sensitive to structural features.
3.  **Combined Fingerprint**: The results from these hashers, along with a color histogram, are combined into a single "fingerprint" dictionary for the frame.
4.  **Comparison**: To compare two frames, their fingerprints are compared. The similarity is calculated using a weighted average of the normalized Hamming distance between their hashes and the correlation between their histograms. The weights are tuned based on the reliability of each hash type for video content. This entire process is parallelized across multiple CPU cores for maximum speed.

### Spatial Alignment (Template Matching)

**TLDR:** To find the correct position for the foreground video, the tool takes a screenshot from the middle of it and searches for that exact image within a screenshot from the background video.

---

Spatial alignment determines the `(x, y)` coordinates at which to overlay the foreground frame onto the background. `vidkompy` uses a highly accurate and efficient method based on template matching.

1.  **Frame Selection**: A single frame is extracted from the temporal midpoint of both the foreground and background videos. This is done to get a representative frame, avoiding potential opening/closing titles or black frames.
2.  **Grayscale Conversion**: The frames are converted to grayscale. This speeds up the matching process by 3x and makes the alignment more robust to minor color variations between the videos.
3.  **Template Matching**: The core of the alignment is `cv2.matchTemplate` using the `TM_CCOEFF_NORMED` method. This function effectively "slides" the smaller foreground frame image across the larger background frame image and calculates a normalized cross-correlation score at each position.
4.  **Locating the Best Match**: The position with the highest correlation score (from `cv2.minMaxLoc`) is considered the best match. This location `(x_offset, y_offset)` represents the top-left corner where the foreground should be placed. The confidence of this match is the correlation score itself, which typically approaches `1.0` for a perfect match.
5.  **Scaling**: The system checks if the foreground video is larger than the background. If so, it is scaled down to fit, and the scale factor is recorded.

### Temporal Alignment Engines

**TLDR:** `vidkompy` offers two temporal alignment engines: **Fast** for quick processing with good results, and **Precise** for maximum accuracy with advanced drift correction. Both find the optimal "path" through time that perfectly syncs the foreground to the background.

---

Temporal alignment is the most critical and complex part of `vidkompy`. The goal is to create a mapping `FrameAlignment(fg_frame_idx, bg_frame_idx)` for every single foreground frame. `vidkompy` provides two distinct engines for this task:

#### Fast Engine (Default)

The **Fast Engine** uses **Dynamic Time Warping (DTW)** with perceptual hashing for efficient alignment:

1.  **Frame Sampling & Fingerprinting**: The tool samples frames sparsely based on the `max_keyframes` parameter and computes their perceptual fingerprints using multiple hash algorithms (pHash, AverageHash, ColorMomentHash, MarrHildrethHash).
2.  **Cost Matrix Construction**: A cost matrix is built where `cost(i, j)` is the "distance" (i.e., `1.0 - similarity`) between the fingerprint of foreground frame `i` and background frame `j`.
3.  **DTW with Constraints**: The DTW algorithm finds the lowest-cost path through this matrix with:
   - **Monotonicity**: The path can only move forward in time, preventing temporal jumps
   - **Sakoe-Chiba Band**: Constrains the search to a window around the diagonal (reduces complexity from O(N²) to O(N×w))
4.  **Direct Mapping Mode**: With `max_keyframes=1` (default in fast mode), the engine forces direct frame mapping to eliminate drift entirely.
5.  **Interpolation**: For sparse sampling, the engine linearly interpolates between matched keyframes to create a complete alignment map.

**Characteristics:**
- Processing time: ~15 seconds for an 8-second video
- Minimal drift with direct mapping mode
- Suitable for most use cases

#### Precise Engine (Advanced)

The **Precise Engine** implements a sophisticated multi-resolution approach for maximum accuracy:

1.  **Multi-Resolution Hierarchical Alignment**:
   - Creates temporal pyramids at multiple resolutions (1/16, 1/8, 1/4, 1/2, full)
   - Performs coarse-to-fine alignment, starting at the lowest resolution
   - Each level refines the previous level's mapping
   - Applies drift correction every 100 frames

2.  **Keyframe Detection and Anchoring**:
   - Automatically detects keyframes based on temporal changes using Gaussian filtering
   - Aligns keyframes between videos as anchor points
   - Forces alignment at keyframes to prevent long-range drift
   - Detects scene changes and content transitions

3.  **Bidirectional DTW**:
   - Runs DTW in both forward and backward directions
   - Averages the two alignment paths to reduce systematic bias
   - Provides more robust alignment for videos with varying content

4.  **Sliding Window Refinement**:
   - Refines alignment in 30-frame windows
   - Searches locally for optimal alignment adjustments
   - Applies Gaussian smoothing for smooth transitions
   - Ensures strict monotonicity throughout

5.  **Confidence-Based Weighting**:
   - Computes confidence scores for each alignment
   - Weights multiple alignment methods based on their confidence
   - Combines results for optimal accuracy

**Characteristics:**
- Processing time: ~5 minutes for an 8-second video (includes full frame extraction)
- Virtually eliminates all temporal drift
- Handles complex scenarios with varying frame rates and content changes
- Best for critical applications requiring perfect synchronization

#### Engine Comparison

| Aspect | Fast Engine | Precise Engine |
|--------|-------------|----------------|
| **Algorithm** | Single-pass DTW with perceptual hashing | Multi-resolution hierarchical alignment |
| **Processing Time** | ~2x real-time | ~40x real-time |
| **Drift Handling** | Direct mapping (no drift) or interpolation | Active correction + keyframe anchoring |
| **Frame Extraction** | On-demand during composition | Full extraction before alignment |
| **Memory Usage** | Low (streaming) | High (all frames in memory) |
| **Accuracy** | Good, minimal drift at endpoints | Excellent, no drift throughout |
| **Best For** | Quick processing, standard videos | Critical applications, complex content |

## Usage

### Prerequisites

You must have the **FFmpeg** binary installed on your system and accessible in your system's `PATH`. `vidkompy` depends on it for all video and audio processing tasks.

### Installation

The tool is a Python package. It is recommended to install it from the repository to get the latest version.

```bash
# Clone the repository
git clone https://github.com/twardoch/vidkompy.git
cd vidkompy

# Install using uv (or pip)
uv pip install .
```

### Command-Line Interface (CLI)

The tool is run from the command line, providing paths to the background and foreground videos.

**Basic Examples:**

```bash
# Fast engine with direct mapping (default, no drift)
python -m vidkompy --bg background.mp4 --fg foreground.mp4

# Precise engine for maximum accuracy (slower but perfect sync)
python -m vidkompy --bg background.mp4 --fg foreground.mp4 --engine precise

# Custom output path
python -m vidkompy --bg bg.mp4 --fg fg.mp4 --output result.mp4
```

**CLI Help:**

```
INFO: Showing help with the command '__main__.py -- --help'.

NAME
    __main__.py - Overlay foreground video onto background video with intelligent alignment.

SYNOPSIS
    __main__.py BG FG <flags>

DESCRIPTION
    Overlay foreground video onto background video with intelligent alignment.

POSITIONAL ARGUMENTS
    BG
        Type: str | pathlib.Path
        Background video path
    FG
        Type: str | pathlib.Path
        Foreground video path

FLAGS
    -o, --output=OUTPUT
        Type: Optional[str | pathlib...
        Default: None
        Output video path (auto-generated if not provided)
    -e, --engine=ENGINE
        Type: str
        Default: 'fast'
        Temporal alignment engine - 'fast' (current) or 'precise' (coming soon) (default: 'fast')
    -m, --margin=MARGIN
        Type: int
        Default: 8
        Border thickness for border matching mode (default: 8)
    -s, --smooth=SMOOTH
        Type: bool
        Default: False
        Enable smooth blending at frame edges
    -g, --gpu=GPU
        Type: bool
        Default: False
        Enable GPU acceleration (future feature)
    -v, --verbose=VERBOSE
        Type: bool
        Default: False
        Enable verbose logging

NOTES
    You can also use flags syntax for POSITIONAL ARGUMENTS
```

## Performance

Recent updates have significantly improved `vidkompy`'s performance and accuracy:

### Real-World Performance Comparison

Based on actual benchmarks with an 8-second test video (1920x1080 background, 1920x870 foreground, ~480 frames):

| Engine | Processing Time | Speed Ratio | Drift at 1s | Drift at End | Notes |
|--------|----------------|-------------|-------------|--------------|-------|
| **Fast (default)** | 15.8 seconds | ~2x real-time | Minimal | Minimal | Direct mapping prevents drift |
| **Precise** | 5m 18s | ~40x real-time | Less drift | Minimal | Full frame extraction + multi-resolution |

**Key Performance Insights:**

- **Fast Engine**: Processes at approximately 2x real-time speed. With `max_keyframes=1` (default), it uses direct frame mapping which completely eliminates drift while maintaining fast performance.

- **Precise Engine**: While significantly slower (~40x real-time), it provides superior alignment accuracy, especially for complex videos. Interestingly, it shows less drift at the 1-second mark compared to the fast engine, though both engines perform well at video endpoints.

### Technical Optimizations

- **Drift Elimination**: The fast engine now defaults to `max_keyframes=1`, forcing direct frame-to-frame mapping that eliminates temporal drift entirely.
- **Optimized Compositing**: Sequential frame reading instead of random access yields a **10-100x speedup** in the final composition stage.
- **Parallel Processing**: Frame fingerprinting and cost matrix computation leverage all available CPU cores.
- **Perceptual Hashing**: Frame comparison is **100-1000x faster** than pixel-wise methods while maintaining accuracy.
- **Memory Efficiency**: The fast engine uses streaming processing, while the precise engine trades memory for accuracy by loading all frames.

### Choosing the Right Engine

**Use the Fast Engine (default) when:**
- You need quick results (2x real-time processing)
- The videos are already reasonably synchronized
- Minor imperfections are acceptable
- Processing many videos in batch

**Use the Precise Engine when:**
- Perfect synchronization is critical
- Videos have complex timing variations
- Content quality justifies longer processing time
- Working with professionally edited content

## Development

To contribute to `vidkompy`, set up a development environment using `hatch`.

### Setup

1.  Clone the repository.
2.  Ensure you have `hatch` installed (`pip install hatch`).
3.  The project is managed through `hatch` environments defined in `pyproject.toml`.

### Key Commands

Run these commands from the root of the repository.

- **Run Tests**:

```bash
hatch run test
```

- **Run Tests with Coverage Report**:

```bash
hatch run test-cov
```

- **Run Type Checking**:

```bash
hatch run type-check
```

- **Check Formatting and Linting**:

```bash
hatch run lint
```

- **Automatically Fix Formatting and Linting Issues**:

```bash
hatch run fix
```

## License

This project is licensed under the MIT License. See the [LICENSE](https://www.google.com/search?q=LICENSE) file for details.



START SPECIFICATION:
---
description: Create overview documentation for projects focused on video processing, synchronization, and alignment, particularly when dealing with foreground/background video composition and intelligent temporal matching
globs: *.py,*.md
alwaysApply: false
---


# main-overview

## Development Guidelines

- Only modify code directly relevant to the specific request. Avoid changing unrelated functionality.
- Never replace code with placeholders like `# ... rest of the processing ...`. Always include complete code.
- Break problems into smaller steps. Think through each step separately before implementing.
- Always provide a complete PLAN with REASONING based on evidence from code and logs before making changes.
- Explain your OBSERVATIONS clearly, then provide REASONING to identify the exact issue. Add console logs when needed to gather more information.


The vidkompy system implements intelligent video overlay and synchronization with three core business domains:

## Frame Analysis System (Importance: 95)
- Perceptual hashing engine combines multiple algorithms:
  - Frequency domain analysis (pHash)
  - Brightness patterns (AverageHash) 
  - Color distribution statistics (ColorMomentHash)
  - Edge detection (MarrHildrethHash)
- Weighted fingerprint generation for optimal frame matching
- Border region masking for edge-based alignment

## Temporal Alignment Engine (Importance: 98)
- Dynamic Time Warping (DTW) implementation with:
  - Sakoe-Chiba band constraints
  - Monotonic frame mapping
  - Keyframe interpolation
- Adaptive keyframe density calculation
- Foreground-first preservation principle
- Frame sequence validation

## Spatial Positioning System (Importance: 92)
- Template matching with normalized correlation
- Auto-scaling for resolution mismatches
- Border mode alignment options
- Position confidence scoring
- Center alignment fallback

## Video Composition Rules (Importance: 85)
- Foreground frame preservation guarantee
- Background frame warping/adaptation
- Audio stream selection logic
- Quality-based decision system
- Temporal drift prevention

Core Integration Points:
```
src/vidkompy/
  ├── core/
  │   ├── alignment_engine.py    # Orchestration logic
  │   ├── dtw_aligner.py        # Temporal matching
  │   ├── frame_fingerprint.py  # Frame analysis
  │   └── spatial_alignment.py  # Position detection
```

$END$
END SPECIFICATION
</file>

<file path="src/vidkompy/__main__.py">
#!/usr/bin/env python
# this_file: src/vidkompy/__main__.py

"""Enable running vidkompy as a module with python -m vidkompy."""

import fire
from vidkompy.vidkompy import main


def cli():
    fire.Fire(main)


if __name__ == "__main__":
    cli()
</file>

<file path=".gitignore">
tests/
__pycache__/
__pypackages__/
._*
.cache
.coverage
.coverage.*
.cursorignore
.cursorindexingignore
.dmypy.json
.DS_Store
.eggs/
.env
.hypothesis/
.installed.cfg
.ipynb_checkpoints
.mypy_cache/
.nox/
.pdm-build/
.pdm-python
.pdm.toml
.pybuilder/
.pypirc
.pyre/
.pytest_cache/
.Python
.pytype/
.ropeproject
.ruff_cache/
.sass-cache
.scrapy
.specstory/.what-is-this.md
.Spotlight-V100
.spyderproject
.spyproject
.tox/
.Trashes
.venv
.webassets-cache
*.cover
*.egg
*.egg-info/
*.log
*.manifest
*.mo
*.out
*.pot
*.py,cover
*.py[cod]
*.pyc
*.sage.py
*.so
*.spec
*$py.class
/site
build/
celerybeat-schedule
celerybeat.pid
cover/
coverage.xml
cython_debug/
db.sqlite3
db.sqlite3-journal
Desktop.ini
develop-eggs/
dist/
dmypy.json
docs/_build/
downloads/
eggs/
env.bak/
env/
ENV/
htmlcov/
instance/
ipython_config.py
lib/
lib64/
local_settings.py
MANIFEST
node_modules
nosetests.xml
parts/
pip-delete-this-directory.txt
pip-log.txt
profile_default/
sdist/
share/python-wheels/
target/
Thumbs.db
var/
venv
venv.bak/
venv/
wheels/
</file>

<file path=".cursor/rules/alignment-algorithms.mdc">
---
description: Algorithms for spatial and temporal alignment of video frames during synchronization and overlay processing
globs: src/vidkompy/core/alignment_engine.py,src/vidkompy/core/dtw_aligner.py,src/vidkompy/core/frame_fingerprint.py,src/vidkompy/core/spatial_alignment.py
alwaysApply: false
---


# alignment-algorithms

## Frame Fingerprinting System (Importance: 95)

Multi-algorithm perceptual hashing combining:
- pHash for frequency domain analysis
- AverageHash for mean color patterns 
- ColorMomentHash for color distribution stats
- MarrHildrethHash for edge/shape detection

Weighted similarity scoring prioritizes structural matches over color variations.

File: src/vidkompy/core/frame_fingerprint.py

## Temporal Alignment Engine (Importance: 98)

Dynamic Time Warping (DTW) implementation with:
- Sakoe-Chiba band constraint to prevent extreme warping
- Adaptive keyframe density based on FPS differences
- Monotonic frame mapping enforcement
- Linear interpolation between key matches
- Target of 200 keyframes for drift prevention

File: src/vidkompy/core/dtw_aligner.py

## Spatial Positioning System (Importance: 92)

Template matching with:
- Mid-point frame sampling strategy
- Normalized cross-correlation scoring
- Dynamic scale factor handling
- Border-based matching mode
- Edge blending support

File: src/vidkompy/core/spatial_alignment.py

## Core Alignment Rules (Importance: 96)

Video compositing policies:
- Foreground frame preservation guarantee
- Background warping to match foreground timing
- Intelligent audio source selection
- Frame sequence validation
- Progressive frame processing

File: src/vidkompy/core/alignment_engine.py

$END$
</file>

<file path=".cursor/rules/frame-matching-models.mdc">
---
description: Frame matching models and algorithms for video synchronization and overlay alignment
globs: src/vidkompy/core/frame_fingerprint.py,src/vidkompy/core/dtw_aligner.py,src/vidkompy/core/spatial_alignment.py
alwaysApply: false
---


# frame-matching-models

## Frame Fingerprinting Models

Implements multi-algorithm perceptual hashing to create unique frame signatures:

1. Core Hash Combination
- pHash: DCT-based frequency domain analysis
- AverageHash: Mean pixel intensity patterns 
- ColorMomentHash: Statistical color distribution
- MarrHildrethHash: Edge and contour detection

2. Frame Similarity Model
- Weighted combination of hash distances
- Color histogram correlation scoring
- Border region masking for edge-based matching

## Temporal Alignment Model 

DTW-based frame mapping system with constraints:

1. Frame Sequence Mapping
- Monotonic path finding through similarity matrix
- Sakoe-Chiba band limiting maximum temporal offset
- Keyframe interpolation for continuous mapping

2. Matching Modes
- Full frame comparison for precise alignment
- Border region analysis for edge-based sync
- Classic keyframe matching fallback

## Spatial Positioning Model

Template matching system for overlay positioning:

1. Position Detection
- Normalized cross-correlation scoring
- Mid-point frame sampling strategy
- Scale factor computation for size mismatches

2. Alignment Confidence
- Match quality scoring
- Position validation checks
- Fallback positioning rules

## Integration Rules

1. Frame Selection Logic
- Foreground frame preservation priority
- Background frame warping/adaptation
- Keyframe density calculation

2. Quality Control
- Alignment confidence thresholds
- Match validation rules
- Fallback behavior triggers

3. Mode Selection
- Border vs full-frame analysis
- Temporal sync method determination
- Spatial alignment strategy choice

Importance Scores:
- Frame Fingerprinting Models: 95 (Core matching capability)
- Temporal Alignment Model: 90 (Critical sync logic)
- Spatial Positioning Model: 85 (Key overlay functionality)
- Integration Rules: 75 (Important workflow control)

$END$
</file>

<file path=".cursor/rules/video-composition-rules.mdc">
---
description: Rules and constraints for video composition, synchronization, and quality control in vidkompy
globs: src/vidkompy/core/alignment_engine.py,src/vidkompy/core/dtw_aligner.py,src/vidkompy/core/video_processor.py
alwaysApply: false
---


# video-composition-rules

## Core Composition Rules

### Video Quality Preservation
- Foreground video frames must never be modified or dropped
- Background video can be warped/adapted to match foreground
- Output resolution matches foreground when possible
- Foreground timing dictates final frame sequencing
- Background content stretched/compressed to maintain sync

### Audio Integration 
- Foreground audio track takes precedence if available
- Background audio used only when foreground lacks audio
- Audio synchronization must match video frame alignment
- No audio resampling/quality reduction allowed

### Frame Synchronization
- Every foreground frame requires matching background frame
- Frame matching based on perceptual similarity scoring
- Temporal monotonicity must be preserved
- Background frames interpolated for gaps
- Border regions given special consideration in matching

### Quality Thresholds
- Minimum similarity score: 0.75 for frame matches
- Maximum temporal offset: 0.5 seconds
- Required spatial alignment confidence: 0.85
- Minimum keyframe density: 200 frames
- Maximum frame interpolation gap: 5 frames

### Border Mode Rules
- Edge region detection required
- Border thickness configurable (default 8px)
- Smooth blending at transitions
- Special perceptual hashing for borders
- Edge alignment takes priority over content

### Processing Constraints
- Sequential frame access required
- No random frame access allowed
- Foreground frame order preserved
- Background frame order flexible
- Single-pass composition only

$END$
</file>

<file path=".cursor/rules/video-processing-flow.mdc">
---
description: Comprehensive guide for video ingestion, processing, alignment and output generation workflow
globs: 
alwaysApply: false
---


# video-processing-flow

## Core Processing Pipeline

### Video Ingestion
- Frame extraction and fingerprinting using multi-algorithm perceptual hashing
- Parallel processing of frame batches for fingerprint generation
- Adaptive keyframe sampling based on video characteristics
- Importance Score: 85

### Frame Analysis 
- Multi-stage fingerprint generation combining:
  - Frequency domain analysis (pHash)
  - Average color patterns
  - Color distribution moments
  - Edge detection via Marr-Hildreth 
- Border region masking for edge-based alignment
- Importance Score: 90

### Temporal Alignment
- DTW-based frame matching with Sakoe-Chiba band constraint
- Adaptive keyframe density calculation
- Monotonic frame mapping with interpolation
- Temporal locality enforcement
- Importance Score: 95

### Spatial Positioning
- Template matching using normalized cross-correlation
- Dynamic scale factor computation
- Edge-based positioning for border mode
- Automatic offset calculation
- Importance Score: 85

### Composition Engine
- Sequential frame compositing
- Foreground frame preservation
- Dynamic background frame warping
- Audio stream selection and sync
- Importance Score: 80

### Output Generation
- Progressive frame writing
- Quality-preserving encoding
- Audio-video synchronization
- Metadata retention
- Importance Score: 75

Key Files:
```
src/vidkompy/core/
  ├── alignment_engine.py     # Main processing orchestrator
  ├── dtw_aligner.py         # Temporal alignment 
  ├── frame_fingerprint.py   # Frame analysis
  ├── spatial_alignment.py   # Position calculation
  └── video_processor.py     # I/O and composition
```

$END$
</file>

<file path="src/vidkompy/core/spatial_alignment.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/spatial_alignment.py

"""
Spatial alignment module for finding optimal overlay positions.

Implements template matching for precise spatial alignment.
"""

import cv2
import numpy as np
from loguru import logger

from vidkompy.models import SpatialAlignment


class SpatialAligner:
    """Handles spatial alignment of foreground on background frames.

    This module finds the optimal position to place the foreground video
    within the background video frame.

    Why spatial alignment is important:
    - Videos might be cropped differently
    - One video might be a subset/window of the other
    - Automatic alignment avoids manual positioning
    - Ensures content overlap for best visual result
    """

    def align(
        self,
        bg_frame: np.ndarray,
        fg_frame: np.ndarray,
        method: str = "precise",
        skip_alignment: bool = False,
    ) -> SpatialAlignment:
        """Find optimal position for foreground on background.

        Args:
            bg_frame: Background frame
            fg_frame: Foreground frame
            method: Alignment method (ignored, always uses template matching)
            skip_alignment: If True, just center the foreground

        Returns:
            SpatialAlignment with offset and scale
        """
        bg_h, bg_w = bg_frame.shape[:2]
        fg_h, fg_w = fg_frame.shape[:2]

        # Check if scaling needed
        scale_factor = 1.0
        if fg_w > bg_w or fg_h > bg_h:
            scale_factor = min(bg_w / fg_w, bg_h / fg_h)
            logger.warning(
                f"Foreground larger than background, scaling by {scale_factor:.3f}"
            )

            # Scale foreground
            new_w = int(fg_w * scale_factor)
            new_h = int(fg_h * scale_factor)
            fg_frame = cv2.resize(
                fg_frame, (new_w, new_h), interpolation=cv2.INTER_AREA
            )
            fg_h, fg_w = new_h, new_w

        if skip_alignment:
            # Center alignment
            x_offset = (bg_w - fg_w) // 2
            y_offset = (bg_h - fg_h) // 2
            return SpatialAlignment(
                x_offset=x_offset,
                y_offset=y_offset,
                scale_factor=scale_factor,
                confidence=1.0,
            )

        # Always use template matching
        return self._template_matching(bg_frame, fg_frame, scale_factor)

    def _template_matching(
        self, bg_frame: np.ndarray, fg_frame: np.ndarray, scale_factor: float
    ) -> SpatialAlignment:
        """Find best position using template matching.

        Uses normalized cross-correlation to find the best match.

        Why template matching:
        - Works perfectly when FG is an exact subset of BG
        - Very fast for exact matches
        - High confidence scores for good matches
        - OpenCV implementation is highly optimized

        Why we use grayscale:
        - 3x faster than color matching
        - More robust to color shifts
        - Structural alignment matters more than color

        Limitations:
        - Fails if videos have different compression/artifacts
        - Sensitive to brightness/contrast changes
        - Requires FG to be subset of BG
        """
        logger.debug("Using template matching for spatial alignment")

        # Convert to grayscale
        bg_gray = cv2.cvtColor(bg_frame, cv2.COLOR_BGR2GRAY)
        fg_gray = cv2.cvtColor(fg_frame, cv2.COLOR_BGR2GRAY)

        # Apply template matching
        result = cv2.matchTemplate(bg_gray, fg_gray, cv2.TM_CCOEFF_NORMED)

        # Find best match
        min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)

        # Top-left corner of match
        x_offset, y_offset = max_loc

        logger.info(
            f"Template match found at ({x_offset}, {y_offset}) "
            f"with confidence {max_val:.3f}"
        )

        return SpatialAlignment(
            x_offset=x_offset,
            y_offset=y_offset,
            scale_factor=scale_factor,
            confidence=float(max_val),
        )
</file>

<file path="benchmark.sh">
#!/usr/bin/env bash
# Test all engines including the new tunnel engines

# for engine in tunnel_mask tunnel_full mask precise fast; do
for engine in tunnel_mask tunnel_full; do
    # For tunnel engines, only window parameter matters
    if [[ $engine == tunnel_* ]]; then
        for window in 30 60 100; do
            base="q-${engine}-w${window}"
            echo ${base}
            time python -m vidkompy -b tests/bg.mp4 -f tests/fg.mp4 -e ${engine} -w ${window} -o tests/${base}.mp4 --verbose >tests/${base}.txt 2>tests/${base}.err.txt
        done
    else
        # For other engines, test drift and window combinations
        for drift in 8 100; do
            for window in 8 100; do
                base="q-${engine}-d${drift}-w${window}"
                echo ${base}
                time python -m vidkompy -b tests/bg.mp4 -f tests/fg.mp4 -e ${engine} -d ${drift} -w ${window} -o tests/${base}.mp4 --verbose >tests/${base}.txt 2>tests/${base}.err.txt
            done
        done
    fi
done
</file>

<file path="CLAUDE.md">
## Coding style

<guidelines>
# When you write code

- Iterate gradually, avoiding major changes
- Minimize confirmations and checks
- Preserve existing code/structure unless necessary
- Use constants over magic numbers
- Check for existing solutions in the codebase before starting
- Check often the coherence of the code you’re writing with the rest of the code.
- Focus on minimal viable increments and ship early
- Write explanatory docstrings/comments that explain what and WHY this does, explain where and how the code is used/referred to elsewhere in the code
- Analyze code line-by-line
- Handle failures gracefully with retries, fallbacks, user guidance
- Address edge cases, validate assumptions, catch errors early
- Let the computer do the work, minimize user decisions
- Reduce cognitive load, beautify code
- Modularize repeated logic into concise, single-purpose functions
- Favor flat over nested structures
- Consistently keep, document, update and consult the holistic overview mental image of the codebase. 

Work in rounds: 

- Create `PROGRESS.md` as a detailed flat plan with `[ ]` items. 
- Identify the most important TODO items, and create `TODO.md` with `[ ]` items. 
- Implement the changes. 
- Update `PROGRESS.md` and `TODO.md` as you go. 
- After each round of changes, update `CHANGELOG.md` with the changes.
- Update `README.md` to reflect the changes.

## Keep track of paths

In each source file, maintain the up-to-date `this_file` record that shows the path of the current file relative to project root. Place the `this_file` record near the top of the file, as a comment after the shebangs, or in the YAML Markdown frontmatter.

## When you write Python

- Use `uv pip`, never `pip`
- Use `python -m` when running code
- PEP 8: Use consistent formatting and naming
- Write clear, descriptive names for functions and variables
- PEP 20: Keep code simple and explicit. Prioritize readability over cleverness
- Use type hints in their simplest form (list, dict, | for unions)
- PEP 257: Write clear, imperative docstrings
- Use f-strings. Use structural pattern matching where appropriate
- ALWAYS add "verbose" mode logugu-based logging, & debug-log
- For CLI Python scripts, use fire & rich, and start the script with

```
#!/usr/bin/env -S uv run -s
# /// script
# dependencies = ["PKG1", "PKG2"]
# ///
# this_file: PATH_TO_CURRENT_FILE
```

Ask before extending/refactoring existing code in a way that may add complexity or break things.

When you’re finished, print "Wait, but" to go back, think & reflect, revise & improvement what you’ve done (but don’t invent functionality freely). Repeat this. But stick to the goal of "minimal viable next version". Lead two experts: "Ideot" for creative, unorthodox ideas, and "Critin" to critique flawed thinking and moderate for balanced discussions. The three of you shall illuminate knowledge with concise, beautiful responses, process methodically for clear answers, collaborate step-by-step, sharing thoughts and adapting. If errors are found, step back and focus on accuracy and progress.

## After Python changes run:

```
fd -e py -x autoflake {}; fd -e py -x pyupgrade --py312-plus {}; fd -e py -x ruff check --output-format=github --fix --unsafe-fixes {}; fd -e py -x ruff format --respect-gitignore --target-version py312 {}; python -m pytest;
```

Be creative, diligent, critical, relentless & funny!
</guidelines>


# `vidkompy`

**Intelligent Video Overlay and Synchronization**

`vidkompy` is a powerful command-line tool engineered to overlay a foreground video onto a background video with exceptional precision and automatic alignment. The system intelligently handles discrepancies in resolution, frame rate, duration, and audio, prioritizing content integrity and synchronization accuracy over raw processing speed.

The core philosophy of `vidkompy` is to treat the **foreground video as the definitive source of quality and timing**. All its frames are preserved without modification or re-timing. The background video is dynamically adapted—stretched, retimed, and selectively sampled—to synchronize perfectly with every frame of the foreground content, ensuring a seamless and coherent final output.

---

## Features

- **Automatic Spatial Alignment**: Intelligently detects the optimal x/y offset to position the foreground video within the background, even if they are cropped differently.
- **Advanced Temporal Synchronization**: Aligns videos with different start times, durations, and frame rates, eliminating temporal drift and ensuring content matches perfectly over time.
- **Foreground-First Principle**: Guarantees that every frame of the foreground video is included in the output, preserving its original timing and quality. The background video is adapted to match the foreground.
- **Drift-Free Alignment**: Utilizes Dynamic Time Warping (DTW) to create a globally optimal, monotonic alignment, preventing the common "drift-and-catchup" artifacts seen with simpler methods.
- **High-Performance Processing**: Leverages multi-core processing, perceptual hashing, and optimized video I/O to deliver results quickly.
- Frame fingerprinting is 100-1000x faster than traditional pixel-wise comparison.
- Sequential video composition is 10-100x faster than random-access methods.
- **Smart Audio Handling**: Automatically uses the foreground audio track if available, falling back to the background audio. The audio is correctly synchronized with the final video.
- **Flexible Operation Modes**: Supports specialized modes like `border` matching for aligning content based on visible background edges, and `smooth` blending for seamless visual integration.

## How It Works

The `vidkompy` pipeline is a multi-stage process designed for precision and accuracy:

1.  **Video Analysis**: The tool begins by probing both background (BG) and foreground (FG) videos using `ffprobe` to extract essential metadata: resolution, frames per second (FPS), duration, frame count, and audio stream information.

2.  **Spatial Alignment**: To determine _where_ to place the foreground on the background, `vidkompy` extracts a sample frame from the middle of each video (where content is most likely to be stable). It then calculates the optimal (x, y) offset.

3.  **Temporal Alignment**: This is the core of `vidkompy`. To determine _when_ to start the overlay and how to map frames over time, the tool generates "fingerprints" of frames from both videos and uses Dynamic Time Warping (DTW) to find the best alignment path. This ensures every foreground frame is matched to the most suitable background frame.

4.  **Video Composition**: Once the spatial and temporal alignments are known, `vidkompy` composes the final video. It reads both video streams sequentially (for maximum performance) and, for each foreground frame, fetches the corresponding background frame as determined by the alignment map. The foreground is then overlaid at the correct spatial position.

5.  **Audio Integration**: After the silent video is composed, `vidkompy` adds the appropriate audio track (preferring the foreground's audio) with the correct offset to ensure it's perfectly synchronized with the video content.

## The Algorithms

`vidkompy` employs several sophisticated algorithms to achieve its high-precision results.

### Frame Fingerprinting (Perceptual Hashing)

**TLDR:** Instead of comparing the millions of pixels in a frame, `vidkompy` creates a tiny, unique "fingerprint" (a hash) for each frame. Comparing these small fingerprints is thousands of times faster and smart enough to ignore minor changes from video compression.

---

The `FrameFingerprinter` module is designed for ultra-fast and robust frame comparison. It uses perceptual hashing, which generates a compact representation of a frame's visual structure.

The process works as follows:

1.  **Standardization**: The input frame is resized to a small, standard size (e.g., 64x64 pixels) and converted to grayscale. This ensures consistency and focuses on structural information over color.
2.  **Multi-Algorithm Hashing**: To improve robustness, `vidkompy` computes several types of perceptual hashes for each frame, as different algorithms are sensitive to different visual features:
- `pHash` (Perceptual Hash): Analyzes the frequency domain (using DCT), making it robust to changes in brightness, contrast, and gamma correction.
- `AverageHash`: Computes a hash based on the average color of the frame.
- `ColorMomentHash`: Captures the color distribution statistics of the frame.
- `MarrHildrethHash`: Detects edges and shapes, making it sensitive to structural features.
3.  **Combined Fingerprint**: The results from these hashers, along with a color histogram, are combined into a single "fingerprint" dictionary for the frame.
4.  **Comparison**: To compare two frames, their fingerprints are compared. The similarity is calculated using a weighted average of the normalized Hamming distance between their hashes and the correlation between their histograms. The weights are tuned based on the reliability of each hash type for video content. This entire process is parallelized across multiple CPU cores for maximum speed.

### Spatial Alignment (Template Matching)

**TLDR:** To find the correct position for the foreground video, the tool takes a screenshot from the middle of it and searches for that exact image within a screenshot from the background video.

---

Spatial alignment determines the `(x, y)` coordinates at which to overlay the foreground frame onto the background. `vidkompy` uses a highly accurate and efficient method based on template matching.

1.  **Frame Selection**: A single frame is extracted from the temporal midpoint of both the foreground and background videos. This is done to get a representative frame, avoiding potential opening/closing titles or black frames.
2.  **Grayscale Conversion**: The frames are converted to grayscale. This speeds up the matching process by 3x and makes the alignment more robust to minor color variations between the videos.
3.  **Template Matching**: The core of the alignment is `cv2.matchTemplate` using the `TM_CCOEFF_NORMED` method. This function effectively "slides" the smaller foreground frame image across the larger background frame image and calculates a normalized cross-correlation score at each position.
4.  **Locating the Best Match**: The position with the highest correlation score (from `cv2.minMaxLoc`) is considered the best match. This location `(x_offset, y_offset)` represents the top-left corner where the foreground should be placed. The confidence of this match is the correlation score itself, which typically approaches `1.0` for a perfect match.
5.  **Scaling**: The system checks if the foreground video is larger than the background. If so, it is scaled down to fit, and the scale factor is recorded.

### Temporal Alignment Engines

**TLDR:** `vidkompy` offers two temporal alignment engines: **Fast** for quick processing with good results, and **Precise** for maximum accuracy with advanced drift correction. Both find the optimal "path" through time that perfectly syncs the foreground to the background.

---

Temporal alignment is the most critical and complex part of `vidkompy`. The goal is to create a mapping `FrameAlignment(fg_frame_idx, bg_frame_idx)` for every single foreground frame. `vidkompy` provides two distinct engines for this task:

#### Fast Engine (Default)

The **Fast Engine** uses **Dynamic Time Warping (DTW)** with perceptual hashing for efficient alignment:

1.  **Frame Sampling & Fingerprinting**: The tool samples frames sparsely based on the `max_keyframes` parameter and computes their perceptual fingerprints using multiple hash algorithms (pHash, AverageHash, ColorMomentHash, MarrHildrethHash).
2.  **Cost Matrix Construction**: A cost matrix is built where `cost(i, j)` is the "distance" (i.e., `1.0 - similarity`) between the fingerprint of foreground frame `i` and background frame `j`.
3.  **DTW with Constraints**: The DTW algorithm finds the lowest-cost path through this matrix with:
   - **Monotonicity**: The path can only move forward in time, preventing temporal jumps
   - **Sakoe-Chiba Band**: Constrains the search to a window around the diagonal (reduces complexity from O(N²) to O(N×w))
4.  **Direct Mapping Mode**: With `max_keyframes=1` (default in fast mode), the engine forces direct frame mapping to eliminate drift entirely.
5.  **Interpolation**: For sparse sampling, the engine linearly interpolates between matched keyframes to create a complete alignment map.

**Characteristics:**
- Processing time: ~15 seconds for an 8-second video
- Minimal drift with direct mapping mode
- Suitable for most use cases

#### Precise Engine (Advanced)

The **Precise Engine** implements a sophisticated multi-resolution approach for maximum accuracy:

1.  **Multi-Resolution Hierarchical Alignment**:
   - Creates temporal pyramids at multiple resolutions (1/16, 1/8, 1/4, 1/2, full)
   - Performs coarse-to-fine alignment, starting at the lowest resolution
   - Each level refines the previous level's mapping
   - Applies drift correction every 100 frames

2.  **Keyframe Detection and Anchoring**:
   - Automatically detects keyframes based on temporal changes using Gaussian filtering
   - Aligns keyframes between videos as anchor points
   - Forces alignment at keyframes to prevent long-range drift
   - Detects scene changes and content transitions

3.  **Bidirectional DTW**:
   - Runs DTW in both forward and backward directions
   - Averages the two alignment paths to reduce systematic bias
   - Provides more robust alignment for videos with varying content

4.  **Sliding Window Refinement**:
   - Refines alignment in 30-frame windows
   - Searches locally for optimal alignment adjustments
   - Applies Gaussian smoothing for smooth transitions
   - Ensures strict monotonicity throughout

5.  **Confidence-Based Weighting**:
   - Computes confidence scores for each alignment
   - Weights multiple alignment methods based on their confidence
   - Combines results for optimal accuracy

**Characteristics:**
- Processing time: ~5 minutes for an 8-second video (includes full frame extraction)
- Virtually eliminates all temporal drift
- Handles complex scenarios with varying frame rates and content changes
- Best for critical applications requiring perfect synchronization

#### Engine Comparison

| Aspect | Fast Engine | Precise Engine |
|--------|-------------|----------------|
| **Algorithm** | Single-pass DTW with perceptual hashing | Multi-resolution hierarchical alignment |
| **Processing Time** | ~2x real-time | ~40x real-time |
| **Drift Handling** | Direct mapping (no drift) or interpolation | Active correction + keyframe anchoring |
| **Frame Extraction** | On-demand during composition | Full extraction before alignment |
| **Memory Usage** | Low (streaming) | High (all frames in memory) |
| **Accuracy** | Good, minimal drift at endpoints | Excellent, no drift throughout |
| **Best For** | Quick processing, standard videos | Critical applications, complex content |

## Usage

### Prerequisites

You must have the **FFmpeg** binary installed on your system and accessible in your system's `PATH`. `vidkompy` depends on it for all video and audio processing tasks.

### Installation

The tool is a Python package. It is recommended to install it from the repository to get the latest version.

```bash
# Clone the repository
git clone https://github.com/twardoch/vidkompy.git
cd vidkompy

# Install using uv (or pip)
uv pip install .
```

### Command-Line Interface (CLI)

The tool is run from the command line, providing paths to the background and foreground videos.

**Basic Examples:**

```bash
# Fast engine with direct mapping (default, no drift)
python -m vidkompy --bg background.mp4 --fg foreground.mp4

# Precise engine for maximum accuracy (slower but perfect sync)
python -m vidkompy --bg background.mp4 --fg foreground.mp4 --engine precise

# Custom output path
python -m vidkompy --bg bg.mp4 --fg fg.mp4 --output result.mp4
```

**CLI Help:**

```
INFO: Showing help with the command '__main__.py -- --help'.

NAME
    __main__.py - Overlay foreground video onto background video with intelligent alignment.

SYNOPSIS
    __main__.py BG FG <flags>

DESCRIPTION
    Overlay foreground video onto background video with intelligent alignment.

POSITIONAL ARGUMENTS
    BG
        Type: str | pathlib.Path
        Background video path
    FG
        Type: str | pathlib.Path
        Foreground video path

FLAGS
    -o, --output=OUTPUT
        Type: Optional[str | pathlib...
        Default: None
        Output video path (auto-generated if not provided)
    -e, --engine=ENGINE
        Type: str
        Default: 'fast'
        Temporal alignment engine - 'fast' (current) or 'precise' (coming soon) (default: 'fast')
    -m, --margin=MARGIN
        Type: int
        Default: 8
        Border thickness for border matching mode (default: 8)
    -s, --smooth=SMOOTH
        Type: bool
        Default: False
        Enable smooth blending at frame edges
    -g, --gpu=GPU
        Type: bool
        Default: False
        Enable GPU acceleration (future feature)
    -v, --verbose=VERBOSE
        Type: bool
        Default: False
        Enable verbose logging

NOTES
    You can also use flags syntax for POSITIONAL ARGUMENTS
```

## Performance

Recent updates have significantly improved `vidkompy`'s performance and accuracy:

### Real-World Performance Comparison

Based on actual benchmarks with an 8-second test video (1920x1080 background, 1920x870 foreground, ~480 frames):

| Engine | Processing Time | Speed Ratio | Drift at 1s | Drift at End | Notes |
|--------|----------------|-------------|-------------|--------------|-------|
| **Fast (default)** | 15.8 seconds | ~2x real-time | Minimal | Minimal | Direct mapping prevents drift |
| **Precise** | 5m 18s | ~40x real-time | Less drift | Minimal | Full frame extraction + multi-resolution |

**Key Performance Insights:**

- **Fast Engine**: Processes at approximately 2x real-time speed. With `max_keyframes=1` (default), it uses direct frame mapping which completely eliminates drift while maintaining fast performance.

- **Precise Engine**: While significantly slower (~40x real-time), it provides superior alignment accuracy, especially for complex videos. Interestingly, it shows less drift at the 1-second mark compared to the fast engine, though both engines perform well at video endpoints.

### Technical Optimizations

- **Drift Elimination**: The fast engine now defaults to `max_keyframes=1`, forcing direct frame-to-frame mapping that eliminates temporal drift entirely.
- **Optimized Compositing**: Sequential frame reading instead of random access yields a **10-100x speedup** in the final composition stage.
- **Parallel Processing**: Frame fingerprinting and cost matrix computation leverage all available CPU cores.
- **Perceptual Hashing**: Frame comparison is **100-1000x faster** than pixel-wise methods while maintaining accuracy.
- **Memory Efficiency**: The fast engine uses streaming processing, while the precise engine trades memory for accuracy by loading all frames.

### Choosing the Right Engine

**Use the Fast Engine (default) when:**
- You need quick results (2x real-time processing)
- The videos are already reasonably synchronized
- Minor imperfections are acceptable
- Processing many videos in batch

**Use the Precise Engine when:**
- Perfect synchronization is critical
- Videos have complex timing variations
- Content quality justifies longer processing time
- Working with professionally edited content

## Development

To contribute to `vidkompy`, set up a development environment using `hatch`.

### Setup

1.  Clone the repository.
2.  Ensure you have `hatch` installed (`pip install hatch`).
3.  The project is managed through `hatch` environments defined in `pyproject.toml`.

### Key Commands

Run these commands from the root of the repository.

- **Run Tests**:

```bash
hatch run test
```

- **Run Tests with Coverage Report**:

```bash
hatch run test-cov
```

- **Run Type Checking**:

```bash
hatch run type-check
```

- **Check Formatting and Linting**:

```bash
hatch run lint
```

- **Automatically Fix Formatting and Linting Issues**:

```bash
hatch run fix
```

## License

This project is licensed under the MIT License. See the [LICENSE](https://www.google.com/search?q=LICENSE) file for details.



START SPECIFICATION:
---
description: Create overview documentation for projects focused on video processing, synchronization, and alignment, particularly when dealing with foreground/background video composition and intelligent temporal matching
globs: *.py,*.md
alwaysApply: false
---


# main-overview

## Development Guidelines

- Only modify code directly relevant to the specific request. Avoid changing unrelated functionality.
- Never replace code with placeholders like `# ... rest of the processing ...`. Always include complete code.
- Break problems into smaller steps. Think through each step separately before implementing.
- Always provide a complete PLAN with REASONING based on evidence from code and logs before making changes.
- Explain your OBSERVATIONS clearly, then provide REASONING to identify the exact issue. Add console logs when needed to gather more information.


The vidkompy system implements intelligent video overlay and synchronization with three core business domains:

## Frame Analysis System (Importance: 95)
- Perceptual hashing engine combines multiple algorithms:
  - Frequency domain analysis (pHash)
  - Brightness patterns (AverageHash) 
  - Color distribution statistics (ColorMomentHash)
  - Edge detection (MarrHildrethHash)
- Weighted fingerprint generation for optimal frame matching
- Border region masking for edge-based alignment

## Temporal Alignment Engine (Importance: 98)
- Dynamic Time Warping (DTW) implementation with:
  - Sakoe-Chiba band constraints
  - Monotonic frame mapping
  - Keyframe interpolation
- Adaptive keyframe density calculation
- Foreground-first preservation principle
- Frame sequence validation

## Spatial Positioning System (Importance: 92)
- Template matching with normalized correlation
- Auto-scaling for resolution mismatches
- Border mode alignment options
- Position confidence scoring
- Center alignment fallback

## Video Composition Rules (Importance: 85)
- Foreground frame preservation guarantee
- Background frame warping/adaptation
- Audio stream selection logic
- Quality-based decision system
- Temporal drift prevention

Core Integration Points:
```
src/vidkompy/
  ├── core/
  │   ├── alignment_engine.py    # Orchestration logic
  │   ├── dtw_aligner.py        # Temporal matching
  │   ├── frame_fingerprint.py  # Frame analysis
  │   └── spatial_alignment.py  # Position detection
```

$END$
END SPECIFICATION
</file>

<file path=".claude/settings.local.json">
{
  "permissions": {
    "allow": [
      "Bash(fd:*)",
      "Bash(chmod:*)",
      "Bash(python:*)",
      "Bash(ls:*)",
      "Bash(ffprobe:*)",
      "Bash(cp:*)",
      "Bash(rm:*)",
      "Bash(mkdir:*)",
      "Bash(grep:*)",
      "Bash(time python:*)",
      "Bash(find:*)",
      "mcp__search1api__search",
      "mcp__search1api__crawl"
    ],
    "deny": []
  }
}
</file>

<file path="src/vidkompy/core/frame_fingerprint.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/frame_fingerprint.py

"""
Fast frame fingerprinting system using perceptual hashing.

This module provides ultra-fast frame comparison capabilities that are
100-1000x faster than SSIM while maintaining good accuracy for similar frames.
"""

import cv2
import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing as mp
from loguru import logger
import time

try:
    from vidkompy.core.numba_optimizations import (
        compute_hamming_distances_batch,
        compute_histogram_correlation,
        compute_weighted_similarity,
    )
    NUMBA_AVAILABLE = True
except ImportError:
    logger.warning("Numba optimizations not available for fingerprinting")
    NUMBA_AVAILABLE = False


class FrameFingerprinter:
    """Ultra-fast frame comparison using perceptual hashing.

    Why perceptual hashing:
    - 100-1000x faster than SSIM
    - Robust to compression artifacts and minor color/brightness changes
    - Compact representation (64 bits per frame)
    - Works well for finding similar frames

    Why multiple hash algorithms:
    - Different hashes capture different aspects of the image
    - Combining them improves robustness
    - Reduces false positives/negatives
    """

    def __init__(self, log_init: bool = True):
        """Initialize fingerprinter with multiple hash algorithms.

        Args:
            log_init: Whether to log initialization messages
        """
        self.hashers = {}
        self._init_hashers(log_init)

        # Cache for computed fingerprints
        self.fingerprint_cache: dict[str, dict[int, dict[str, np.ndarray]]] = {}
        
        # Flag to control numba usage
        self.use_numba = NUMBA_AVAILABLE

    def _init_hashers(self, log_init: bool = True):
        """Initialize available hash algorithms.

        Why these specific algorithms:
        - PHash: Frequency domain analysis, good for structure
        - AverageHash: Average color, good for brightness
        - ColorMomentHash: Color distribution, good for color changes
        - MarrHildrethHash: Edge detection, good for shapes
        """
        try:
            self.hashers["phash"] = cv2.img_hash.PHash_create()
            if log_init:
                logger.debug("✓ PHash initialized")
        except AttributeError:
            logger.warning("PHash not available")

        try:
            self.hashers["ahash"] = cv2.img_hash.AverageHash_create()
            if log_init:
                logger.debug("✓ AverageHash initialized")
        except AttributeError:
            logger.warning("AverageHash not available")

        try:
            self.hashers["dhash"] = cv2.img_hash.ColorMomentHash_create()
            if log_init:
                logger.debug("✓ ColorMomentHash initialized")
        except AttributeError:
            logger.warning("ColorMomentHash not available")

        try:
            self.hashers["mhash"] = cv2.img_hash.MarrHildrethHash_create()
            if log_init:
                logger.debug("✓ MarrHildrethHash initialized")
        except AttributeError:
            logger.warning("MarrHildrethHash not available")

        if not self.hashers:
            msg = (
                "No perceptual hash algorithms available. "
                "Please install opencv-contrib-python."
            )
            raise RuntimeError(msg)

        if log_init:
            logger.info(f"Initialized {len(self.hashers)} hash algorithms")

    def compute_fingerprint(self, frame: np.ndarray) -> dict[str, np.ndarray]:
        """Compute multi-algorithm fingerprint for a frame.

        Args:
            frame: Input frame as numpy array

        Returns:
            Dictionary of hash algorithm names to hash values

        Why multiple algorithms:
        - Redundancy reduces errors
        - Different algorithms catch different changes
        - Weighted combination improves accuracy
        """
        # Standardize frame size for consistent hashing
        std_size = (64, 64)
        std_frame = cv2.resize(frame, std_size, interpolation=cv2.INTER_AREA)

        # Convert to grayscale for most hashes
        if len(std_frame.shape) == 3:
            gray_frame = cv2.cvtColor(std_frame, cv2.COLOR_BGR2GRAY)
        else:
            gray_frame = std_frame

        fingerprint = {}

        # Compute each available hash
        for name, hasher in self.hashers.items():
            try:
                if name in ["phash", "ahash", "mhash"]:
                    # These work on grayscale
                    hash_value = hasher.compute(gray_frame)
                else:
                    # ColorMomentHash needs color
                    hash_value = hasher.compute(std_frame)

                fingerprint[name] = hash_value
            except Exception as e:
                logger.warning(f"Failed to compute {name}: {e}")

        # Add color histogram as additional feature
        if len(frame.shape) == 3:
            fingerprint["histogram"] = self._compute_color_histogram(std_frame)

        return fingerprint

    def compute_masked_fingerprint(
        self, frame: np.ndarray, mask: np.ndarray
    ) -> dict[str, np.ndarray]:
        """Compute fingerprint for masked region of frame.

        Args:
            frame: Input frame
            mask: Binary mask (1 = include, 0 = exclude)

        Returns:
            Fingerprint dictionary
        """
        # Apply mask to frame
        masked_frame = frame.copy()
        if len(frame.shape) == 3:
            # Apply to all channels
            for c in range(frame.shape[2]):
                masked_frame[:, :, c] = frame[:, :, c] * mask
        else:
            masked_frame = frame * mask

        # Crop to bounding box of mask to focus on relevant region
        rows = np.any(mask, axis=1)
        cols = np.any(mask, axis=0)

        if not np.any(rows) or not np.any(cols):
            # Empty mask, return default fingerprint
            return self.compute_fingerprint(frame)

        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]

        cropped = masked_frame[rmin : rmax + 1, cmin : cmax + 1]

        # Compute fingerprint on cropped region
        return self.compute_fingerprint(cropped)

    def _compute_color_histogram(self, frame: np.ndarray) -> np.ndarray:
        """Compute color histogram for additional discrimination.

        Why color histogram:
        - Captures global color distribution
        - Complements structure-based hashes
        - Fast to compute and compare
        """
        # Compute histogram for each channel
        hist_b = cv2.calcHist([frame], [0], None, [32], [0, 256])
        hist_g = cv2.calcHist([frame], [1], None, [32], [0, 256])
        hist_r = cv2.calcHist([frame], [2], None, [32], [0, 256])

        # Concatenate and normalize
        hist = np.concatenate([hist_b, hist_g, hist_r])
        hist = hist.flatten()
        hist = hist / (hist.sum() + 1e-7)  # Normalize

        return hist.astype(np.float32)

    def compare_fingerprints(
        self, fp1: dict[str, np.ndarray], fp2: dict[str, np.ndarray]
    ) -> float:
        """Compare two fingerprints and return similarity score.

        Args:
            fp1: First fingerprint
            fp2: Second fingerprint

        Returns:
            Similarity score between 0 and 1

        Why weighted combination:
        - PHash is most reliable for video frames
        - Other hashes help disambiguate
        - Histogram adds color information
        """
        # Try numba optimization for histogram comparison if available
        if self.use_numba and "histogram" in fp1 and "histogram" in fp2:
            try:
                hist_score = compute_histogram_correlation(fp1["histogram"], fp2["histogram"])
            except Exception:
                # Fallback to OpenCV
                hist_score = cv2.compareHist(
                    fp1["histogram"], fp2["histogram"], cv2.HISTCMP_CORREL
                )
        elif "histogram" in fp1 and "histogram" in fp2:
            hist_score = cv2.compareHist(
                fp1["histogram"], fp2["histogram"], cv2.HISTCMP_CORREL
            )
        else:
            hist_score = 0.0
        
        hist_score = max(0, hist_score)  # Ensure non-negative
        
        # Collect hash distances
        hash_distances = []
        hash_names = []
        
        for name in fp1:
            if name not in fp2 or name == "histogram":
                continue
            
            # Ensure uint8 type for NORM_HAMMING
            h1 = (
                fp1[name].astype(np.uint8)
                if fp1[name].dtype != np.uint8
                else fp1[name]
            )
            h2 = (
                fp2[name].astype(np.uint8)
                if fp2[name].dtype != np.uint8
                else fp2[name]
            )
            distance = cv2.norm(h1, h2, cv2.NORM_HAMMING)
            
            # Normalize to 0-1 distance
            max_bits = h1.shape[0] * 8
            normalized_distance = distance / max_bits
            hash_distances.append(normalized_distance)
            hash_names.append(name)
        
        if not hash_distances and hist_score == 0:
            return 0.0
        
        # Define weights
        weight_map = {
            "phash": 0.4,  # Most reliable
            "ahash": 0.2,  # Good for brightness
            "dhash": 0.2,  # Good for color
            "mhash": 0.1,  # Good for edges
            "histogram": 0.1,  # Global color
        }
        
        # Use numba optimization if available
        if self.use_numba and hash_distances:
            try:
                # Prepare weights array
                weights = np.array([weight_map.get(name, 0.1) for name in hash_names])
                weights = np.append(weights, weight_map.get("histogram", 0.1))
                
                # Convert to numpy arrays
                hash_dist_array = np.array(hash_distances, dtype=np.float64)
                
                return compute_weighted_similarity(hash_dist_array, hist_score, weights)
            except Exception:
                # Fallback to standard implementation
                pass
        
        # Standard implementation
        total_weight = 0
        total_score = 0
        
        # Add hash similarities
        for i, name in enumerate(hash_names):
            weight = weight_map.get(name, 0.1)
            similarity = 1.0 - hash_distances[i]
            total_score += similarity * weight
            total_weight += weight
        
        # Add histogram score
        if hist_score > 0:
            weight = weight_map.get("histogram", 0.1)
            total_score += hist_score * weight
            total_weight += weight
        
        return total_score / total_weight if total_weight > 0 else 0.0

    def precompute_video_fingerprints(
        self,
        video_path: str,
        frame_indices: list[int],
        video_processor,
        resize_factor: float = 0.25,
    ) -> dict[int, dict[str, np.ndarray]]:
        """Precompute fingerprints for all specified frames in parallel.

        Args:
            video_path: Path to video file
            frame_indices: List of frame indices to process
            video_processor: VideoProcessor instance for frame extraction
            resize_factor: Factor to resize frames before hashing

        Returns:
            Dictionary mapping frame indices to fingerprints

        Why parallel processing:
        - Frame extraction is I/O bound (use threads)
        - Hash computation is CPU bound (use processes)
        - Significant speedup on multi-core systems
        """
        # Check cache first
        if video_path in self.fingerprint_cache:
            cached = self.fingerprint_cache[video_path]
            missing = [idx for idx in frame_indices if idx not in cached]
            if not missing:
                return {idx: cached[idx] for idx in frame_indices}
            frame_indices = missing

        logger.info(f"Computing fingerprints for {len(frame_indices)} frames...")
        start_time = time.time()

        # Step 1: Extract frames in batches (I/O bound)
        frames_dict = {}
        batch_size = 50

        for i in range(0, len(frame_indices), batch_size):
            batch_indices = frame_indices[i : i + batch_size]
            batch_frames = video_processor.extract_frames(
                video_path, batch_indices, resize_factor
            )

            for idx, frame in zip(batch_indices, batch_frames, strict=False):
                if frame is not None:
                    frames_dict[idx] = frame

        # Step 2: Compute fingerprints in parallel (CPU bound)
        fingerprints = {}

        # Use process pool for CPU-intensive hash computation
        with ProcessPoolExecutor(max_workers=mp.cpu_count()) as executor:
            # Submit all tasks
            future_to_idx = {
                executor.submit(self._compute_fingerprint_worker, frame): idx
                for idx, frame in frames_dict.items()
            }

            # Collect results
            for future in as_completed(future_to_idx):
                idx = future_to_idx[future]
                try:
                    fingerprint = future.result()
                    fingerprints[idx] = fingerprint
                except Exception as e:
                    logger.warning(
                        f"Failed to compute fingerprint for frame {idx}: {e}"
                    )

        # Update cache
        if video_path not in self.fingerprint_cache:
            self.fingerprint_cache[video_path] = {}
        self.fingerprint_cache[video_path].update(fingerprints)

        elapsed = time.time() - start_time
        fps = len(fingerprints) / elapsed if elapsed > 0 else 0
        logger.info(
            f"Computed {len(fingerprints)} fingerprints in {elapsed:.2f}s "
            f"({fps:.1f} fps)"
        )

        return fingerprints

    @staticmethod
    def _compute_fingerprint_worker(frame: np.ndarray) -> dict[str, np.ndarray]:
        """Worker function for parallel fingerprint computation.

        Static method to enable pickling for multiprocessing.
        """
        # Create a new instance in the worker process without logging
        fingerprinter = FrameFingerprinter(log_init=False)
        return fingerprinter.compute_fingerprint(frame)
    
    def compare_fingerprints_batch(
        self, 
        fps1: list[dict[str, np.ndarray]], 
        fps2: list[dict[str, np.ndarray]]
    ) -> np.ndarray:
        """Batch comparison of fingerprints using numba optimization.
        
        Args:
            fps1: List of fingerprints from first set
            fps2: List of fingerprints from second set
            
        Returns:
            Similarity matrix (len(fps1), len(fps2))
        """
        n1, n2 = len(fps1), len(fps2)
        similarities = np.zeros((n1, n2), dtype=np.float32)
        
        if self.use_numba and n1 > 5 and n2 > 5:
            try:
                # Extract hash types from first fingerprint
                hash_types = [k for k in fps1[0].keys() if k != "histogram"]
                
                # Prepare batch arrays for each hash type
                for hash_type in hash_types:
                    if all(hash_type in fp for fp in fps1 + fps2):
                        # Extract hashes for this type
                        hashes1 = np.array([fp[hash_type].flatten() for fp in fps1])
                        hashes2 = np.array([fp[hash_type].flatten() for fp in fps2])
                        
                        # Compute batch distances
                        distances = compute_hamming_distances_batch(
                            hashes1.astype(np.uint8), hashes2.astype(np.uint8)
                        )
                        
                        # Convert to similarities and accumulate
                        max_bits = hashes1.shape[1] * 8
                        hash_similarities = 1.0 - (distances / max_bits)
                        
                        # Apply weight for this hash type
                        weight_map = {
                            "phash": 0.4,
                            "ahash": 0.2,
                            "dhash": 0.2,
                            "mhash": 0.1,
                        }
                        weight = weight_map.get(hash_type, 0.1)
                        similarities += hash_similarities * weight
                
                # Add histogram correlations if available
                if all("histogram" in fp for fp in fps1 + fps2):
                    for i in range(n1):
                        for j in range(n2):
                            hist_corr = compute_histogram_correlation(
                                fps1[i]["histogram"], fps2[j]["histogram"]
                            )
                            similarities[i, j] += max(0, hist_corr) * 0.1
                
                # Normalize by total weight
                total_weight = sum(weight_map.values()) + 0.1  # +0.1 for histogram
                similarities /= total_weight
                
                return similarities
                
            except Exception as e:
                logger.warning(f"Batch comparison failed, falling back: {e}")
        
        # Fallback to individual comparisons
        for i in range(n1):
            for j in range(n2):
                similarities[i, j] = self.compare_fingerprints(fps1[i], fps2[j])
        
        return similarities

    def compute_fingerprints(self, frames: np.ndarray) -> np.ndarray:
        """Compute fingerprints for multiple frames.
        
        Args:
            frames: Array of frames (N, H, W, C) or list of frames
            
        Returns:
            Array of fingerprints as feature vectors
        """
        logger.info(f"Computing fingerprints for {len(frames)} frames...")
        start_time = time.time()
        
        fingerprints = []
        
        # Process frames in batches using multiprocessing
        with ProcessPoolExecutor(max_workers=mp.cpu_count()) as executor:
            # Submit all tasks
            futures = [executor.submit(self._compute_fingerprint_worker, frame) 
                      for frame in frames]
            
            # Collect results in order
            for future in futures:
                try:
                    fingerprint = future.result()
                    # Convert fingerprint dict to feature vector
                    feature_vec = self._fingerprint_to_vector(fingerprint)
                    fingerprints.append(feature_vec)
                except Exception as e:
                    logger.warning(f"Failed to compute fingerprint: {e}")
                    # Add zero vector as fallback
                    fingerprints.append(np.zeros(self._get_fingerprint_size()))
        
        elapsed = time.time() - start_time
        fps = len(fingerprints) / elapsed if elapsed > 0 else 0
        logger.info(
            f"Computed {len(fingerprints)} fingerprints in {elapsed:.2f}s "
            f"({fps:.1f} fps)"
        )
        
        return np.array(fingerprints)
    
    def _fingerprint_to_vector(self, fingerprint: dict[str, np.ndarray]) -> np.ndarray:
        """Convert fingerprint dictionary to feature vector.
        
        Args:
            fingerprint: Dictionary of hash values and histogram
            
        Returns:
            Flattened feature vector
        """
        features = []
        
        # Extract hash values in consistent order
        for hash_name in ["phash", "ahash", "dhash", "mhash"]:
            if hash_name in fingerprint:
                features.append(fingerprint[hash_name].flatten())
        
        # Add histogram if present
        if "histogram" in fingerprint:
            features.append(fingerprint["histogram"])
        
        # Concatenate all features
        if features:
            return np.concatenate(features)
        else:
            return np.zeros(self._get_fingerprint_size())
    
    def _get_fingerprint_size(self) -> int:
        """Get the size of the fingerprint feature vector.
        
        Returns:
            Size of feature vector
        """
        # Compute size based on available hashers
        # Most hashes produce 8-byte (64-bit) values
        size = 0
        for name in ["phash", "ahash", "dhash", "mhash"]:
            if name in self.hashers:
                size += 8  # 8 bytes per hash
        
        # Add histogram size (32 bins * 3 channels)
        size += 32 * 3
        
        return size

    def clear_cache(self, video_path: str | None = None):
        """Clear fingerprint cache.

        Args:
            video_path: Specific video to clear, or None for all
        """
        if video_path:
            self.fingerprint_cache.pop(video_path, None)
        else:
            self.fingerprint_cache.clear()
</file>

<file path="src/vidkompy/core/multi_resolution_aligner.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/multi_resolution_aligner.py

"""
Multi-resolution temporal alignment for precise video synchronization.

Implements hierarchical DTW with progressive refinement to eliminate drift.
"""

from dataclasses import dataclass
import numpy as np
from loguru import logger

# Import Savitzky-Golay filter if it's not already imported
from scipy.signal import savgol_filter

from .dtw_aligner import DTWAligner
from .frame_fingerprint import FrameFingerprinter

try:
    from vidkompy.core.numba_optimizations import (
        apply_polynomial_drift_correction,
        apply_savitzky_golay_filter,
    )
    NUMBA_AVAILABLE = True
except ImportError:
    logger.warning("Numba optimizations not available for multi-resolution alignment")
    NUMBA_AVAILABLE = False


@dataclass
class PreciseEngineConfig:
    """Configuration for precise temporal alignment engine."""

    # Sampling parameters
    max_resolutions: int = 4  # Number of resolution levels
    base_resolution: int = 16  # Coarsest sampling rate

    # DTW parameters
    initial_window_ratio: float = 0.25  # Window size for coarse DTW
    refinement_window: int = 30  # Frames for sliding window

    # Quality thresholds
    similarity_threshold: float = 0.85  # Minimum frame similarity
    confidence_threshold: float = 0.9  # High confidence threshold

    # Performance tuning
    max_frames_to_process: int = 10000  # Limit for very long videos
    # Reset alignment every N frames
    drift_correction_interval: int = 100
    # Trust in original map vs linear interpolation
    drift_blend_factor: float = 0.85

    # New parameters for enhanced drift correction & smoothing
    # Options: "linear", "polynomial", "loess" (loess deferred)
    drift_correction_model: str = "polynomial"
    poly_degree: int = 2  # Degree for polynomial regression
    # loess_frac: float = 0.25 # LOESS fraction (requires statsmodels)
    adaptive_blend_factor: bool = True  # Enable adaptive blend factor
    # Savitzky-Golay filter window (odd number)
    savitzky_golay_window: int = 21
    # Savitzky-Golay poly order (< window)
    savitzky_golay_polyorder: int = 3
    # Options: "linear", "spline" (spline deferred)
    interpolation_method: str = "linear"
    cli_dtw_window: int = 0  # DTW window size from CLI, 0 to ignore


class MultiResolutionAligner:
    """Multi-resolution temporal alignment with drift correction."""

    def __init__(
        self,
        fingerprinter: FrameFingerprinter,
        config: PreciseEngineConfig | None = None,
        verbose: bool = False,
    ):
        """Initialize multi-resolution aligner.

        Args:
            fingerprinter: Frame fingerprint generator
            config: Engine configuration
            verbose: Enable detailed logging
        """
        self.fingerprinter = fingerprinter
        self.config = config or PreciseEngineConfig()
        self.verbose = verbose
        self.use_numba = NUMBA_AVAILABLE

        # Calculate resolution levels
        self.resolutions = []
        res = self.config.base_resolution
        for _ in range(self.config.max_resolutions):
            self.resolutions.append(res)
            res = max(1, res // 2)

        logger.info(f"Multi-resolution levels: {self.resolutions}")

    def create_temporal_pyramid(
        self, frames: np.ndarray, fingerprints: np.ndarray | None = None
    ) -> dict[int, np.ndarray]:
        """Create multi-resolution temporal pyramid.

        Args:
            frames: Video frames array
            fingerprints: Pre-computed fingerprints (optional)

        Returns:
            Dictionary mapping resolution to sampled fingerprints
        """
        pyramid = {}

        # Compute fingerprints if not provided
        if fingerprints is None:
            logger.info("Computing fingerprints for temporal pyramid...")
            fingerprints = self.fingerprinter.compute_fingerprints(frames)

        # Sample at each resolution
        for res_val in self.resolutions:
            indices = list(range(0, len(fingerprints), res_val))
            pyramid[res_val] = fingerprints[indices]
            # Resolution 1/{res_val}: {len(pyramid[res_val])} samples
            logger.debug(f"Res 1/{res_val}: {len(pyramid[res_val])} samples")

        return pyramid

    def coarse_alignment(
        self, fg_pyramid: dict[int, np.ndarray], bg_pyramid: dict[int, np.ndarray]
    ) -> np.ndarray:
        """Perform coarse alignment at lowest resolution.

        Args:
            fg_pyramid: Foreground temporal pyramid
            bg_pyramid: Background temporal pyramid

        Returns:
            Initial frame mapping at coarsest resolution
        """
        # Start with coarsest resolution
        coarsest_res = max(self.resolutions)
        fg_coarse = fg_pyramid[coarsest_res]
        bg_coarse = bg_pyramid[coarsest_res]

        # Coarse alignment at 1/{coarsest_res} resolution
        logger.info(f"Coarse align at 1/{coarsest_res}")
        # FG samples: {len(fg_coarse)}, BG samples: {len(bg_coarse)}
        logger.debug(f"FG smpl: {len(fg_coarse)}, BG smpl: {len(bg_coarse)}")

        # Use DTW with large window
        window_size = int(len(bg_coarse) * self.config.initial_window_ratio)
        dtw = DTWAligner(window=window_size)

        # Compute cost matrix
        cost_matrix = dtw._compute_cost_matrix(fg_coarse, bg_coarse)

        # Find optimal path
        path = dtw._compute_path(cost_matrix)

        # Convert path to frame mapping
        mapping = np.zeros(len(fg_coarse), dtype=int)
        for i, (fg_idx, bg_idx) in enumerate(path):
            if fg_idx < len(mapping):
                mapping[fg_idx] = bg_idx

        return mapping

    def refine_alignment(
        self,
        fg_pyramid: dict[int, np.ndarray],
        bg_pyramid: dict[int, np.ndarray],
        coarse_mapping: np.ndarray,
        from_res: int,
        to_res: int,
    ) -> np.ndarray:
        """Refine alignment from one resolution to the next.

        Args:
            fg_pyramid: Foreground temporal pyramid
            bg_pyramid: Background temporal pyramid
            coarse_mapping: Mapping at coarser resolution
            from_res: Source resolution
            to_res: Target resolution (finer)

        Returns:
            Refined frame mapping at target resolution
        """
        # Refining alignment: 1/{from_res} -> 1/{to_res}
        logger.info(f"Refine: 1/{from_res} -> 1/{to_res}")

        # Get fingerprints at target resolution
        fg_fine = fg_pyramid[to_res]
        bg_fine = bg_pyramid[to_res]

        # Calculate scaling factor
        scale = from_res // to_res

        # Initialize refined mapping
        refined_mapping = np.zeros(len(fg_fine), dtype=int)

        # Refine each coarse segment
        for i_seg in range(len(coarse_mapping)):
            # Find corresponding fine-resolution range
            fg_start = i_seg * scale
            fg_end = min((i_seg + 1) * scale, len(fg_fine))

            # Find search range in background
            bg_center = coarse_mapping[i_seg] * scale
            bg_start = max(0, bg_center - self.config.refinement_window)
            bg_search_end = min(len(bg_fine), bg_center + self.config.refinement_window)

            if fg_end <= fg_start or bg_search_end <= bg_start:
                continue

            # Extract segments
            fg_segment = fg_fine[fg_start:fg_end]
            bg_segment = bg_fine[bg_start:bg_search_end]

            # Local DTW alignment
            window = min(len(bg_segment) // 2, 10)
            dtw = DTWAligner(window=window)

            try:
                cost_matrix = dtw._compute_cost_matrix(fg_segment, bg_segment)
                path = dtw._compute_path(cost_matrix)

                # Update refined mapping
                for fg_local, bg_local in path:
                    if fg_local < len(fg_segment):
                        fg_global = fg_start + fg_local
                        bg_global = bg_start + bg_local
                        if fg_global < len(refined_mapping):
                            refined_mapping[fg_global] = bg_global
            except Exception as e:
                # Local refinement failed at segment {i_seg}: {e}
                logger.warning(f"Local refine failed at seg {i_seg}: {e}")
                # Fall back to interpolation
                for j_loop in range(fg_start, fg_end):
                    if j_loop < len(refined_mapping):
                        refined_mapping[j_loop] = bg_center + (j_loop - fg_start)

        return refined_mapping

    def hierarchical_alignment(
        self, fg_pyramid: dict[int, np.ndarray], bg_pyramid: dict[int, np.ndarray]
    ) -> np.ndarray:
        """Perform hierarchical alignment from coarse to fine.

        Args:
            fg_pyramid: Foreground temporal pyramid
            bg_pyramid: Background temporal pyramid

        Returns:
            Frame mapping at finest resolution
        """
        # Start with coarse alignment
        mapping = self.coarse_alignment(fg_pyramid, bg_pyramid)
        current_res = max(self.resolutions)

        # Progressively refine
        for next_res in sorted(self.resolutions, reverse=True)[1:]:
            mapping = self.refine_alignment(
                fg_pyramid, bg_pyramid, mapping, current_res, next_res
            )
            current_res = next_res

        return mapping

    def apply_drift_correction(
        self, mapping: np.ndarray, interval: int | None = None
    ) -> np.ndarray:
        """Apply periodic drift correction to prevent accumulation."""
        if interval is None:
            interval = self.config.drift_correction_interval

        if len(mapping) == 0:
            return mapping  # Return empty if input is empty

        logger.info(f"Drift correction every {interval} frames")
        
        # Try numba optimization for polynomial drift correction
        if self.use_numba and self.config.drift_correction_model == "polynomial":
            try:
                corrected = apply_polynomial_drift_correction(
                    mapping.astype(np.float64),
                    interval,
                    self.config.poly_degree,
                    self.config.drift_blend_factor
                )
                return corrected.astype(int)
            except Exception as e:
                logger.warning(f"Numba drift correction failed: {e}")
                logger.info("Falling back to standard implementation")
        
        # Standard implementation
        corrected = mapping.copy()
        num_segments = len(mapping) // interval + 1

        for seg_idx in range(num_segments):
            start_idx = seg_idx * interval
            end_idx = min((seg_idx + 1) * interval, len(mapping))

            if start_idx >= end_idx:
                continue

            segment_mapping = mapping[start_idx:end_idx]
            segment_indices = np.arange(len(segment_mapping))

            expected_segment_progression: np.ndarray

            if (
                self.config.drift_correction_model == "polynomial"
                and len(segment_mapping) > self.config.poly_degree
            ):
                coeffs = np.polyfit(
                    segment_indices, segment_mapping, self.config.poly_degree
                )
                poly = np.poly1d(coeffs)
                expected_segment_progression = poly(segment_indices)
            else:  # Default to linear if polynomial fails or not selected
                expected_start_val = segment_mapping[0]
                expected_end_val = segment_mapping[-1]
                expected_segment_progression = np.linspace(
                    expected_start_val, expected_end_val, len(segment_mapping)
                )

            current_blend_factor = self.config.drift_blend_factor
            if self.config.adaptive_blend_factor and len(segment_mapping) > 1:
                # Simplified adaptive factor: less trust if variance is high
                variance = np.var(np.diff(segment_mapping))
                # Normalize variance crudely; this needs better scaling
                norm_variance = np.clip(
                    variance / (abs(np.mean(np.diff(segment_mapping))) + 1e-5), 0, 1
                )
                current_blend_factor = self.config.drift_blend_factor * (
                    1 - 0.5 * norm_variance
                )

            for k_loop in range(len(segment_mapping)):
                map_k_idx = start_idx + k_loop
                corrected[map_k_idx] = int(
                    current_blend_factor * segment_mapping[k_loop]
                    + (1 - current_blend_factor) * expected_segment_progression[k_loop]
                )
                drift_val = abs(corrected[map_k_idx] - segment_mapping[k_loop])
                if drift_val > 5 and self.verbose:
                    logger.debug(f"Frame {map_k_idx}: drift corr {drift_val} applied")

        # Ensure final mapping is monotonic
        for i_mono in range(1, len(corrected)):
            corrected[i_mono] = max(corrected[i_mono], corrected[i_mono - 1])
        return corrected

    def interpolate_full_mapping(
        self, sparse_mapping: np.ndarray, target_length: int, source_resolution: int
    ) -> np.ndarray:
        """Interpolate sparse mapping to full frame resolution."""
        full_mapping = np.zeros(target_length, dtype=int)
        if len(sparse_mapping) == 0:
            if target_length > 0:
                logger.warning("Empty sparse_mapping, returning zero mapping.")
            return full_mapping

        # Current method is linear interpolation
        if self.config.interpolation_method == "linear" or True:  # Default to linear
            for i_interp in range(target_length):
                sparse_idx = i_interp // source_resolution
                if sparse_idx >= len(sparse_mapping) - 1:
                    full_mapping[i_interp] = sparse_mapping[-1]
                else:
                    alpha = (i_interp % source_resolution) / source_resolution
                    start_bg = sparse_mapping[sparse_idx]
                    end_bg = sparse_mapping[sparse_idx + 1]
                    full_mapping[i_interp] = int(start_bg + alpha * (end_bg - start_bg))
        # else if self.config.interpolation_method == "spline": # Spline deferred
        # pass

        for i_mono in range(1, len(full_mapping)):
            full_mapping[i_mono] = max(full_mapping[i_mono], full_mapping[i_mono - 1])
        return full_mapping

    def align(
        self,
        fg_frames: np.ndarray,
        bg_frames: np.ndarray,
        fg_fingerprints: np.ndarray | None = None,
        bg_fingerprints: np.ndarray | None = None,
    ) -> tuple[np.ndarray, float]:
        """Perform multi-resolution temporal alignment."""
        logger.info(f"Multi-res align: {len(fg_frames)} -> {len(bg_frames)} frames")

        fg_pyramid = self.create_temporal_pyramid(fg_frames, fg_fingerprints)
        bg_pyramid = self.create_temporal_pyramid(bg_frames, bg_fingerprints)

        sparse_mapping = self.hierarchical_alignment(fg_pyramid, bg_pyramid)

        # Apply enhanced drift correction
        corrected_mapping = self.apply_drift_correction(sparse_mapping)

        # Global smoothing using Savitzky-Golay filter
        if (
            len(corrected_mapping) > self.config.savitzky_golay_window
            and self.config.savitzky_golay_window > 0
        ):
            try:
                # Ensure polyorder is less than window_length
                polyorder = min(
                    self.config.savitzky_golay_polyorder,
                    self.config.savitzky_golay_window - 1,
                )
                if polyorder < 0:
                    polyorder = 0  # Must be non-negative

                smoothed_mapping = savgol_filter(
                    corrected_mapping,
                    window_length=self.config.savitzky_golay_window,
                    polyorder=polyorder,
                ).astype(int)
                # Ensure monotonicity after smoothing
                for i_mono in range(1, len(smoothed_mapping)):
                    smoothed_mapping[i_mono] = max(
                        smoothed_mapping[i_mono], smoothed_mapping[i_mono - 1]
                    )
                corrected_mapping = smoothed_mapping
                logger.info("Applied Savitzky-Golay smoothing to mapping.")
            except Exception as e:
                logger.warning(
                    f"Savitzky-Golay smoothing failed: {e}. Using un-smoothed corrected mapping."
                )
        else:
            logger.info(
                "Skipping Savitzky-Golay smoothing (mapping too short or window disabled)."
            )

        full_mapping = self.interpolate_full_mapping(
            corrected_mapping,
            len(fg_frames),
            self.resolutions[-1],  # Finest resolution used
        )

        if len(full_mapping) > 1:
            differences = np.diff(full_mapping)
            expected_diff = (
                len(bg_frames) / len(fg_frames) if len(fg_frames) > 0 else 1.0
            )
            variance = np.var(differences)
            confidence = np.exp(-variance / (expected_diff**2 + 1e-9))
        elif len(full_mapping) == 1 and len(fg_frames) == 1:
            confidence = 1.0
        else:
            confidence = 0.0

        logger.info(f"Alignment complete. Confidence: {confidence:.3f}")
        return full_mapping, confidence
</file>

<file path="src/vidkompy/core/precise_temporal_alignment.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/precise_temporal_alignment.py

"""
Precise temporal alignment implementation with advanced techniques.

Combines multi-resolution alignment, keyframe anchoring, and bidirectional DTW.
"""

from typing import Tuple, Optional, List, Dict
import numpy as np
from loguru import logger
from scipy.signal import find_peaks
from scipy.ndimage import gaussian_filter1d

from .frame_fingerprint import FrameFingerprinter
from .multi_resolution_aligner import MultiResolutionAligner, PreciseEngineConfig
from .dtw_aligner import DTWAligner


class PreciseTemporalAlignment:
    """Precise temporal alignment with drift elimination."""

    def __init__(
        self,
        fingerprinter: FrameFingerprinter,
        verbose: bool = False,
        interval: int = 100,
        cli_window_size: int = 0,
    ):
        """Initialize precise temporal alignment.

        Args:
            fingerprinter: Frame fingerprint generator
            verbose: Enable detailed logging
            interval: Drift correction interval for MultiResolutionAligner
            cli_window_size: DTW window size from CLI (0 means use defaults)
        """
        self.fingerprinter = fingerprinter
        self.verbose = verbose
        self.cli_window_size = cli_window_size

        # Initialize multi-resolution aligner
        config = PreciseEngineConfig(
            max_resolutions=4,
            base_resolution=16,
            drift_correction_interval=interval,
            cli_dtw_window=cli_window_size,
        )
        self.multi_res_aligner = MultiResolutionAligner(fingerprinter, config, verbose)

    def detect_keyframes(
        self, fingerprints: np.ndarray, min_distance: int = 30
    ) -> np.ndarray:
        """Detect keyframes based on temporal changes.

        Args:
            fingerprints: Frame fingerprints
            min_distance: Minimum distance between keyframes

        Returns:
            Indices of detected keyframes
        """
        # Calculate temporal differences
        diffs = np.zeros(len(fingerprints) - 1)
        for i in range(len(fingerprints) - 1):
            diffs[i] = np.linalg.norm(fingerprints[i + 1] - fingerprints[i])

        # Smooth differences
        smoothed = gaussian_filter1d(diffs, sigma=3)

        # Find peaks (scene changes, motion peaks)
        peaks, properties = find_peaks(
            smoothed, distance=min_distance, prominence=np.std(smoothed) * 0.5
        )

        # Always include first and last frames
        keyframes = [0] + list(peaks) + [len(fingerprints) - 1]
        keyframes = sorted(list(set(keyframes)))

        logger.info(f"Detected {len(keyframes)} keyframes")
        return np.array(keyframes)

    def align_keyframes(
        self,
        fg_keyframes: np.ndarray,
        bg_keyframes: np.ndarray,
        fg_fingerprints: np.ndarray,
        bg_fingerprints: np.ndarray,
    ) -> dict[int, int]:
        """Align keyframes between videos.

        Args:
            fg_keyframes: Foreground keyframe indices
            bg_keyframes: Background keyframe indices
            fg_fingerprints: Foreground fingerprints
            bg_fingerprints: Background fingerprints

        Returns:
            Mapping of foreground to background keyframe indices
        """
        # Extract keyframe fingerprints
        fg_kf_prints = fg_fingerprints[fg_keyframes]
        bg_kf_prints = bg_fingerprints[bg_keyframes]

        # Use DTW for keyframe alignment
        dtw = DTWAligner(window=len(bg_keyframes))
        cost_matrix = dtw._compute_cost_matrix(fg_kf_prints, bg_kf_prints)
        path = dtw._compute_path(cost_matrix)

        # Convert to keyframe mapping
        kf_mapping = {}
        for fg_idx, bg_idx in path:
            if fg_idx < len(fg_keyframes) and bg_idx < len(bg_keyframes):
                kf_mapping[fg_keyframes[fg_idx]] = bg_keyframes[bg_idx]

        logger.info(f"Aligned {len(kf_mapping)} keyframe pairs")
        return kf_mapping

    def bidirectional_dtw(
        self,
        fg_fingerprints: np.ndarray,
        bg_fingerprints: np.ndarray,
        window: int = 100,
    ) -> np.ndarray:
        """Perform bidirectional DTW alignment.

        Args:
            fg_fingerprints: Foreground fingerprints
            bg_fingerprints: Background fingerprints
            window: DTW window size

        Returns:
            Averaged bidirectional alignment
        """
        dtw = DTWAligner(window=window)

        # Forward alignment
        logger.debug("Computing forward DTW alignment")
        cost_forward = dtw._compute_cost_matrix(fg_fingerprints, bg_fingerprints)
        path_forward = dtw._compute_path(cost_forward)

        # Backward alignment
        logger.debug("Computing backward DTW alignment")
        cost_backward = dtw._compute_cost_matrix(
            bg_fingerprints[::-1], fg_fingerprints[::-1]
        )
        path_backward = dtw._compute_path(cost_backward)

        # Convert paths to mappings
        forward_mapping = np.zeros(len(fg_fingerprints), dtype=int)
        for fg_idx, bg_idx in path_forward:
            if fg_idx < len(forward_mapping):
                forward_mapping[fg_idx] = bg_idx

        backward_mapping = np.zeros(len(fg_fingerprints), dtype=int)
        for bg_idx, fg_idx in path_backward:
            # Reverse indices
            bg_idx = len(bg_fingerprints) - 1 - bg_idx
            fg_idx = len(fg_fingerprints) - 1 - fg_idx
            if fg_idx >= 0 and fg_idx < len(backward_mapping):
                backward_mapping[fg_idx] = bg_idx

        # Average the mappings
        averaged_mapping = (forward_mapping + backward_mapping) // 2

        # Ensure monotonicity
        for i in range(1, len(averaged_mapping)):
            averaged_mapping[i] = max(averaged_mapping[i], averaged_mapping[i - 1])

        return averaged_mapping

    def refine_with_sliding_window(
        self,
        initial_mapping: np.ndarray,
        fg_fingerprints: np.ndarray,
        bg_fingerprints: np.ndarray,
        window_size: int = 30,
        search_range: int = 10,
    ) -> np.ndarray:
        """Refine alignment using sliding window approach.

        Args:
            initial_mapping: Initial frame mapping
            fg_fingerprints: Foreground fingerprints
            bg_fingerprints: Background fingerprints
            window_size: Size of sliding window
            search_range: Search range for refinement

        Returns:
            Refined frame mapping
        """
        refined_mapping = initial_mapping.copy()
        num_windows = len(fg_fingerprints) // window_size + 1

        logger.info(f"Sliding window refinement: {num_windows} windows")

        for w in range(num_windows):
            start = w * window_size
            end = min((w + 1) * window_size, len(fg_fingerprints))

            if end <= start:
                continue

            # Get window fingerprints
            fg_window = fg_fingerprints[start:end]

            # Determine search range in background
            bg_center = initial_mapping[start]
            bg_start = max(0, bg_center - search_range)
            bg_end = min(len(bg_fingerprints), bg_center + search_range + window_size)

            if bg_end <= bg_start:
                continue

            bg_window = bg_fingerprints[bg_start:bg_end]

            # Find best alignment within window
            best_offset = 0
            best_score = float("inf")

            for offset in range(
                min(search_range * 2, len(bg_window) - len(fg_window) + 1)
            ):
                score = 0
                for i in range(len(fg_window)):
                    if offset + i < len(bg_window):
                        score += np.linalg.norm(fg_window[i] - bg_window[offset + i])

                if score < best_score:
                    best_score = score
                    best_offset = offset

            # Apply refinement
            for i in range(start, end):
                if i < len(refined_mapping):
                    refined_mapping[i] = bg_start + best_offset + (i - start)

        # Smooth transitions between windows
        refined_mapping = gaussian_filter1d(refined_mapping.astype(float), sigma=5)
        refined_mapping = refined_mapping.astype(int)

        # Ensure monotonicity
        for i in range(1, len(refined_mapping)):
            refined_mapping[i] = max(refined_mapping[i], refined_mapping[i - 1])

        return refined_mapping

    def compute_alignment_confidence(
        self,
        mapping: np.ndarray,
        fg_fingerprints: np.ndarray,
        bg_fingerprints: np.ndarray,
    ) -> float:
        """Compute confidence score for alignment.

        Args:
            mapping: Frame mapping
            fg_fingerprints: Foreground fingerprints
            bg_fingerprints: Background fingerprints

        Returns:
            Confidence score (0-1)
        """
        similarities = []

        for fg_idx, bg_idx in enumerate(mapping):
            if bg_idx < len(bg_fingerprints):
                sim = 1.0 - np.linalg.norm(
                    fg_fingerprints[fg_idx] - bg_fingerprints[bg_idx]
                ) / (np.linalg.norm(fg_fingerprints[fg_idx]) + 1e-8)
                similarities.append(sim)

        if similarities:
            mean_sim = np.mean(similarities)
            std_sim = np.std(similarities)
            # High confidence = high mean similarity and low variance
            confidence = mean_sim * np.exp(-std_sim)
            return float(np.clip(confidence, 0, 1))

        return 0.0

    def align(
        self, fg_frames: np.ndarray, bg_frames: np.ndarray
    ) -> tuple[np.ndarray, float]:
        """Perform precise temporal alignment.

        Args:
            fg_frames: Foreground video frames
            bg_frames: Background video frames

        Returns:
            Frame mapping and alignment confidence
        """
        logger.info("Starting precise temporal alignment")
        logger.info(f"FG: {len(fg_frames)} frames, BG: {len(bg_frames)} frames")

        # Compute fingerprints
        logger.info("Computing frame fingerprints...")
        fg_fingerprints = self.fingerprinter.compute_fingerprints(fg_frames)
        bg_fingerprints = self.fingerprinter.compute_fingerprints(bg_frames)

        # Method 1: Multi-resolution alignment
        logger.info("Phase 1: Multi-resolution alignment")
        multi_res_mapping, multi_res_conf = self.multi_res_aligner.align(
            fg_frames, bg_frames, fg_fingerprints, bg_fingerprints
        )

        # Method 2: Keyframe-based alignment
        logger.info("Phase 2: Keyframe detection and alignment")
        fg_keyframes = self.detect_keyframes(fg_fingerprints)
        bg_keyframes = self.detect_keyframes(bg_fingerprints)
        keyframe_mapping = self.align_keyframes(
            fg_keyframes, bg_keyframes, fg_fingerprints, bg_fingerprints
        )

        # Method 3: Bidirectional DTW on reduced samples
        logger.info("Phase 3: Bidirectional DTW refinement")
        sample_rate = max(1, len(fg_frames) // 500)  # Max 500 samples
        fg_sampled = fg_fingerprints[::sample_rate]
        bg_sampled = bg_fingerprints[::sample_rate]

        bidirectional_mapping = self.bidirectional_dtw(
            fg_sampled, bg_sampled, window=50
        )

        # Interpolate bidirectional mapping to full resolution
        full_bidir_mapping = np.interp(
            np.arange(len(fg_frames)),
            np.arange(len(bidirectional_mapping)) * sample_rate,
            bidirectional_mapping * sample_rate,
        ).astype(int)

        # Combine methods with weighted average
        combined_mapping = (0.5 * multi_res_mapping + 0.5 * full_bidir_mapping).astype(
            int
        )

        # Apply keyframe constraints
        for fg_kf, bg_kf in keyframe_mapping.items():
            combined_mapping[fg_kf] = bg_kf

        # Interpolate between keyframes
        keyframe_indices = sorted(keyframe_mapping.keys())
        for i in range(len(keyframe_indices) - 1):
            start_fg = keyframe_indices[i]
            end_fg = keyframe_indices[i + 1]
            start_bg = keyframe_mapping[start_fg]
            end_bg = keyframe_mapping[end_fg]

            # Linear interpolation between keyframes
            for j in range(start_fg + 1, end_fg):
                alpha = (j - start_fg) / (end_fg - start_fg)
                combined_mapping[j] = int(start_bg + alpha * (end_bg - start_bg))

        # Phase 4: Sliding window refinement
        logger.info("Phase 4: Sliding window refinement")
        refined_mapping = self.refine_with_sliding_window(
            combined_mapping, fg_fingerprints, bg_fingerprints
        )

        # Ensure final mapping is within bounds
        refined_mapping = np.clip(refined_mapping, 0, len(bg_frames) - 1)

        # Compute final confidence
        confidence = self.compute_alignment_confidence(
            refined_mapping, fg_fingerprints, bg_fingerprints
        )

        logger.info(f"Precise alignment complete. Confidence: {confidence:.3f}")

        return refined_mapping, confidence
</file>

<file path="src/vidkompy/__version__.py">
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]

TYPE_CHECKING = False
if TYPE_CHECKING:
    VERSION_TUPLE = tuple[int | str, ...]
else:
    VERSION_TUPLE = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE

__version__ = version = "0.0.post3+g39be67c.d20250524"
__version_tuple__ = version_tuple = (0, 0, "post3", "g39be67c.d20250524")
</file>

<file path="src/vidkompy/models.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/models.py

"""
Data models and enums for vidkompy.

Contains all shared data structures used across the application.
"""

from dataclasses import dataclass
from enum import Enum


class MatchTimeMode(Enum):
    """Temporal alignment modes."""

    BORDER = "border"  # Border-based matching (default)
    PRECISE = "precise"  # Use frame-based matching


class SpatialMethod(Enum):
    """Spatial alignment methods."""

    TEMPLATE = "template"  # Template matching (only method)
    CENTER = "center"  # Simple center alignment (fallback)


class TemporalMethod(Enum):
    """Temporal alignment algorithm methods."""

    DTW = "dtw"  # Dynamic Time Warping (new default)
    CLASSIC = "classic"  # Original keyframe matching
    FRAMES = "frames"  # Legacy alias for classic


@dataclass
class VideoInfo:
    """Video metadata container."""

    width: int
    height: int
    fps: float
    duration: float
    frame_count: int
    has_audio: bool
    audio_sample_rate: int | None = None
    audio_channels: int | None = None
    path: str = ""

    @property
    def resolution(self) -> tuple[int, int]:
        """Get video resolution as (width, height)."""
        return (self.width, self.height)

    @property
    def aspect_ratio(self) -> float:
        """Calculate aspect ratio."""
        return self.width / self.height if self.height > 0 else 0


@dataclass
class FrameAlignment:
    """Represents alignment between a foreground and background frame."""

    fg_frame_idx: int  # Foreground frame index (never changes)
    bg_frame_idx: int  # Corresponding background frame index
    similarity_score: float  # Similarity between frames (0-1)

    def __repr__(self) -> str:
        return f"FrameAlignment(fg={self.fg_frame_idx}, bg={self.bg_frame_idx}, sim={self.similarity_score:.3f})"


@dataclass
class SpatialAlignment:
    """Spatial offset for overlaying foreground on background."""

    x_offset: int
    y_offset: int
    scale_factor: float = 1.0
    confidence: float = 1.0

    @property
    def offset(self) -> tuple[int, int]:
        """Get offset as tuple."""
        return (self.x_offset, self.y_offset)


@dataclass
class TemporalAlignment:
    """Temporal alignment results."""

    offset_seconds: float  # Time offset in seconds
    frame_alignments: list[FrameAlignment]  # Frame-by-frame mapping
    method_used: str  # Method that produced this alignment
    confidence: float = 1.0

    @property
    def start_frame(self) -> int | None:
        """Get first aligned foreground frame."""
        return self.frame_alignments[0].fg_frame_idx if self.frame_alignments else None

    @property
    def end_frame(self) -> int | None:
        """Get last aligned foreground frame."""
        return self.frame_alignments[-1].fg_frame_idx if self.frame_alignments else None


@dataclass
class ProcessingOptions:
    """Options for video processing."""

    time_mode: MatchTimeMode
    space_method: str
    skip_spatial: bool
    trim: bool
    max_keyframes: int = 2000
    verbose: bool = False
    border_thickness: int = 8
    blend: bool = False
    window: int = 0
</file>

<file path=".cursorrules">
## Coding style

<guidelines>
# When you write code

- Iterate gradually, avoiding major changes
- Minimize confirmations and checks
- Preserve existing code/structure unless necessary
- Use constants over magic numbers
- Check for existing solutions in the codebase before starting
- Check often the coherence of the code you’re writing with the rest of the code.
- Focus on minimal viable increments and ship early
- Write explanatory docstrings/comments that explain what and WHY this does, explain where and how the code is used/referred to elsewhere in the code
- Analyze code line-by-line
- Handle failures gracefully with retries, fallbacks, user guidance
- Address edge cases, validate assumptions, catch errors early
- Let the computer do the work, minimize user decisions
- Reduce cognitive load, beautify code
- Modularize repeated logic into concise, single-purpose functions
- Favor flat over nested structures
- Consistently keep, document, update and consult the holistic overview mental image of the codebase. 

Work in rounds: 

- Create `PROGRESS.md` as a detailed flat plan with `[ ]` items. 
- Identify the most important TODO items, and create `TODO.md` with `[ ]` items. 
- Implement the changes. 
- Update `PROGRESS.md` and `TODO.md` as you go. 
- After each round of changes, update `CHANGELOG.md` with the changes.
- Update `README.md` to reflect the changes.

## Keep track of paths

In each source file, maintain the up-to-date `this_file` record that shows the path of the current file relative to project root. Place the `this_file` record near the top of the file, as a comment after the shebangs, or in the YAML Markdown frontmatter.

## When you write Python

- Use `uv pip`, never `pip`
- Use `python -m` when running code
- PEP 8: Use consistent formatting and naming
- Write clear, descriptive names for functions and variables
- PEP 20: Keep code simple and explicit. Prioritize readability over cleverness
- Use type hints in their simplest form (list, dict, | for unions)
- PEP 257: Write clear, imperative docstrings
- Use f-strings. Use structural pattern matching where appropriate
- ALWAYS add "verbose" mode logugu-based logging, & debug-log
- For CLI Python scripts, use fire & rich, and start the script with

```
#!/usr/bin/env -S uv run -s
# /// script
# dependencies = ["PKG1", "PKG2"]
# ///
# this_file: PATH_TO_CURRENT_FILE
```

Ask before extending/refactoring existing code in a way that may add complexity or break things.

When you’re finished, print "Wait, but" to go back, think & reflect, revise & improvement what you’ve done (but don’t invent functionality freely). Repeat this. But stick to the goal of "minimal viable next version". Lead two experts: "Ideot" for creative, unorthodox ideas, and "Critin" to critique flawed thinking and moderate for balanced discussions. The three of you shall illuminate knowledge with concise, beautiful responses, process methodically for clear answers, collaborate step-by-step, sharing thoughts and adapting. If errors are found, step back and focus on accuracy and progress.

## After Python changes run:

```
fd -e py -x autoflake {}; fd -e py -x pyupgrade --py312-plus {}; fd -e py -x ruff check --output-format=github --fix --unsafe-fixes {}; fd -e py -x ruff format --respect-gitignore --target-version py312 {}; python -m pytest;
```

Be creative, diligent, critical, relentless & funny!
</guidelines>


# `vidkompy`

**Intelligent Video Overlay and Synchronization**

`vidkompy` is a powerful command-line tool engineered to overlay a foreground video onto a background video with exceptional precision and automatic alignment. The system intelligently handles discrepancies in resolution, frame rate, duration, and audio, prioritizing content integrity and synchronization accuracy over raw processing speed.

The core philosophy of `vidkompy` is to treat the **foreground video as the definitive source of quality and timing**. All its frames are preserved without modification or re-timing. The background video is dynamically adapted—stretched, retimed, and selectively sampled—to synchronize perfectly with every frame of the foreground content, ensuring a seamless and coherent final output.

---

## Features

- **Automatic Spatial Alignment**: Intelligently detects the optimal x/y offset to position the foreground video within the background, even if they are cropped differently.
- **Advanced Temporal Synchronization**: Aligns videos with different start times, durations, and frame rates, eliminating temporal drift and ensuring content matches perfectly over time.
- **Foreground-First Principle**: Guarantees that every frame of the foreground video is included in the output, preserving its original timing and quality. The background video is adapted to match the foreground.
- **Drift-Free Alignment**: Utilizes Dynamic Time Warping (DTW) to create a globally optimal, monotonic alignment, preventing the common "drift-and-catchup" artifacts seen with simpler methods.
- **High-Performance Processing**: Leverages multi-core processing, perceptual hashing, and optimized video I/O to deliver results quickly.
- Frame fingerprinting is 100-1000x faster than traditional pixel-wise comparison.
- Sequential video composition is 10-100x faster than random-access methods.
- **Smart Audio Handling**: Automatically uses the foreground audio track if available, falling back to the background audio. The audio is correctly synchronized with the final video.
- **Flexible Operation Modes**: Supports specialized modes like `border` matching for aligning content based on visible background edges, and `smooth` blending for seamless visual integration.

## How It Works

The `vidkompy` pipeline is a multi-stage process designed for precision and accuracy:

1.  **Video Analysis**: The tool begins by probing both background (BG) and foreground (FG) videos using `ffprobe` to extract essential metadata: resolution, frames per second (FPS), duration, frame count, and audio stream information.

2.  **Spatial Alignment**: To determine _where_ to place the foreground on the background, `vidkompy` extracts a sample frame from the middle of each video (where content is most likely to be stable). It then calculates the optimal (x, y) offset.

3.  **Temporal Alignment**: This is the core of `vidkompy`. To determine _when_ to start the overlay and how to map frames over time, the tool generates "fingerprints" of frames from both videos and uses Dynamic Time Warping (DTW) to find the best alignment path. This ensures every foreground frame is matched to the most suitable background frame.

4.  **Video Composition**: Once the spatial and temporal alignments are known, `vidkompy` composes the final video. It reads both video streams sequentially (for maximum performance) and, for each foreground frame, fetches the corresponding background frame as determined by the alignment map. The foreground is then overlaid at the correct spatial position.

5.  **Audio Integration**: After the silent video is composed, `vidkompy` adds the appropriate audio track (preferring the foreground's audio) with the correct offset to ensure it's perfectly synchronized with the video content.

## The Algorithms

`vidkompy` employs several sophisticated algorithms to achieve its high-precision results.

### Frame Fingerprinting (Perceptual Hashing)

**TLDR:** Instead of comparing the millions of pixels in a frame, `vidkompy` creates a tiny, unique "fingerprint" (a hash) for each frame. Comparing these small fingerprints is thousands of times faster and smart enough to ignore minor changes from video compression.

---

The `FrameFingerprinter` module is designed for ultra-fast and robust frame comparison. It uses perceptual hashing, which generates a compact representation of a frame's visual structure.

The process works as follows:

1.  **Standardization**: The input frame is resized to a small, standard size (e.g., 64x64 pixels) and converted to grayscale. This ensures consistency and focuses on structural information over color.
2.  **Multi-Algorithm Hashing**: To improve robustness, `vidkompy` computes several types of perceptual hashes for each frame, as different algorithms are sensitive to different visual features:
- `pHash` (Perceptual Hash): Analyzes the frequency domain (using DCT), making it robust to changes in brightness, contrast, and gamma correction.
- `AverageHash`: Computes a hash based on the average color of the frame.
- `ColorMomentHash`: Captures the color distribution statistics of the frame.
- `MarrHildrethHash`: Detects edges and shapes, making it sensitive to structural features.
3.  **Combined Fingerprint**: The results from these hashers, along with a color histogram, are combined into a single "fingerprint" dictionary for the frame.
4.  **Comparison**: To compare two frames, their fingerprints are compared. The similarity is calculated using a weighted average of the normalized Hamming distance between their hashes and the correlation between their histograms. The weights are tuned based on the reliability of each hash type for video content. This entire process is parallelized across multiple CPU cores for maximum speed.

### Spatial Alignment (Template Matching)

**TLDR:** To find the correct position for the foreground video, the tool takes a screenshot from the middle of it and searches for that exact image within a screenshot from the background video.

---

Spatial alignment determines the `(x, y)` coordinates at which to overlay the foreground frame onto the background. `vidkompy` uses a highly accurate and efficient method based on template matching.

1.  **Frame Selection**: A single frame is extracted from the temporal midpoint of both the foreground and background videos. This is done to get a representative frame, avoiding potential opening/closing titles or black frames.
2.  **Grayscale Conversion**: The frames are converted to grayscale. This speeds up the matching process by 3x and makes the alignment more robust to minor color variations between the videos.
3.  **Template Matching**: The core of the alignment is `cv2.matchTemplate` using the `TM_CCOEFF_NORMED` method. This function effectively "slides" the smaller foreground frame image across the larger background frame image and calculates a normalized cross-correlation score at each position.
4.  **Locating the Best Match**: The position with the highest correlation score (from `cv2.minMaxLoc`) is considered the best match. This location `(x_offset, y_offset)` represents the top-left corner where the foreground should be placed. The confidence of this match is the correlation score itself, which typically approaches `1.0` for a perfect match.
5.  **Scaling**: The system checks if the foreground video is larger than the background. If so, it is scaled down to fit, and the scale factor is recorded.

### Temporal Alignment Engines

**TLDR:** `vidkompy` offers two temporal alignment engines: **Fast** for quick processing with good results, and **Precise** for maximum accuracy with advanced drift correction. Both find the optimal "path" through time that perfectly syncs the foreground to the background.

---

Temporal alignment is the most critical and complex part of `vidkompy`. The goal is to create a mapping `FrameAlignment(fg_frame_idx, bg_frame_idx)` for every single foreground frame. `vidkompy` provides two distinct engines for this task:

#### Fast Engine (Default)

The **Fast Engine** uses **Dynamic Time Warping (DTW)** with perceptual hashing for efficient alignment:

1.  **Frame Sampling & Fingerprinting**: The tool samples frames sparsely based on the `max_keyframes` parameter and computes their perceptual fingerprints using multiple hash algorithms (pHash, AverageHash, ColorMomentHash, MarrHildrethHash).
2.  **Cost Matrix Construction**: A cost matrix is built where `cost(i, j)` is the "distance" (i.e., `1.0 - similarity`) between the fingerprint of foreground frame `i` and background frame `j`.
3.  **DTW with Constraints**: The DTW algorithm finds the lowest-cost path through this matrix with:
   - **Monotonicity**: The path can only move forward in time, preventing temporal jumps
   - **Sakoe-Chiba Band**: Constrains the search to a window around the diagonal (reduces complexity from O(N²) to O(N×w))
4.  **Direct Mapping Mode**: With `max_keyframes=1` (default in fast mode), the engine forces direct frame mapping to eliminate drift entirely.
5.  **Interpolation**: For sparse sampling, the engine linearly interpolates between matched keyframes to create a complete alignment map.

**Characteristics:**
- Processing time: ~15 seconds for an 8-second video
- Minimal drift with direct mapping mode
- Suitable for most use cases

#### Precise Engine (Advanced)

The **Precise Engine** implements a sophisticated multi-resolution approach for maximum accuracy:

1.  **Multi-Resolution Hierarchical Alignment**:
   - Creates temporal pyramids at multiple resolutions (1/16, 1/8, 1/4, 1/2, full)
   - Performs coarse-to-fine alignment, starting at the lowest resolution
   - Each level refines the previous level's mapping
   - Applies drift correction every 100 frames

2.  **Keyframe Detection and Anchoring**:
   - Automatically detects keyframes based on temporal changes using Gaussian filtering
   - Aligns keyframes between videos as anchor points
   - Forces alignment at keyframes to prevent long-range drift
   - Detects scene changes and content transitions

3.  **Bidirectional DTW**:
   - Runs DTW in both forward and backward directions
   - Averages the two alignment paths to reduce systematic bias
   - Provides more robust alignment for videos with varying content

4.  **Sliding Window Refinement**:
   - Refines alignment in 30-frame windows
   - Searches locally for optimal alignment adjustments
   - Applies Gaussian smoothing for smooth transitions
   - Ensures strict monotonicity throughout

5.  **Confidence-Based Weighting**:
   - Computes confidence scores for each alignment
   - Weights multiple alignment methods based on their confidence
   - Combines results for optimal accuracy

**Characteristics:**
- Processing time: ~5 minutes for an 8-second video (includes full frame extraction)
- Virtually eliminates all temporal drift
- Handles complex scenarios with varying frame rates and content changes
- Best for critical applications requiring perfect synchronization

#### Engine Comparison

| Aspect | Fast Engine | Precise Engine |
|--------|-------------|----------------|
| **Algorithm** | Single-pass DTW with perceptual hashing | Multi-resolution hierarchical alignment |
| **Processing Time** | ~2x real-time | ~40x real-time |
| **Drift Handling** | Direct mapping (no drift) or interpolation | Active correction + keyframe anchoring |
| **Frame Extraction** | On-demand during composition | Full extraction before alignment |
| **Memory Usage** | Low (streaming) | High (all frames in memory) |
| **Accuracy** | Good, minimal drift at endpoints | Excellent, no drift throughout |
| **Best For** | Quick processing, standard videos | Critical applications, complex content |

## Usage

### Prerequisites

You must have the **FFmpeg** binary installed on your system and accessible in your system's `PATH`. `vidkompy` depends on it for all video and audio processing tasks.

### Installation

The tool is a Python package. It is recommended to install it from the repository to get the latest version.

```bash
# Clone the repository
git clone https://github.com/twardoch/vidkompy.git
cd vidkompy

# Install using uv (or pip)
uv pip install .
```

### Command-Line Interface (CLI)

The tool is run from the command line, providing paths to the background and foreground videos.

**Basic Examples:**

```bash
# Fast engine with direct mapping (default, no drift)
python -m vidkompy --bg background.mp4 --fg foreground.mp4

# Precise engine for maximum accuracy (slower but perfect sync)
python -m vidkompy --bg background.mp4 --fg foreground.mp4 --engine precise

# Custom output path
python -m vidkompy --bg bg.mp4 --fg fg.mp4 --output result.mp4
```

**CLI Help:**

```
INFO: Showing help with the command '__main__.py -- --help'.

NAME
    __main__.py - Overlay foreground video onto background video with intelligent alignment.

SYNOPSIS
    __main__.py BG FG <flags>

DESCRIPTION
    Overlay foreground video onto background video with intelligent alignment.

POSITIONAL ARGUMENTS
    BG
        Type: str | pathlib.Path
        Background video path
    FG
        Type: str | pathlib.Path
        Foreground video path

FLAGS
    -o, --output=OUTPUT
        Type: Optional[str | pathlib...
        Default: None
        Output video path (auto-generated if not provided)
    -e, --engine=ENGINE
        Type: str
        Default: 'fast'
        Temporal alignment engine - 'fast' (current) or 'precise' (coming soon) (default: 'fast')
    -m, --margin=MARGIN
        Type: int
        Default: 8
        Border thickness for border matching mode (default: 8)
    -s, --smooth=SMOOTH
        Type: bool
        Default: False
        Enable smooth blending at frame edges
    -g, --gpu=GPU
        Type: bool
        Default: False
        Enable GPU acceleration (future feature)
    -v, --verbose=VERBOSE
        Type: bool
        Default: False
        Enable verbose logging

NOTES
    You can also use flags syntax for POSITIONAL ARGUMENTS
```

## Performance

Recent updates have significantly improved `vidkompy`'s performance and accuracy:

### Real-World Performance Comparison

Based on actual benchmarks with an 8-second test video (1920x1080 background, 1920x870 foreground, ~480 frames):

| Engine | Processing Time | Speed Ratio | Drift at 1s | Drift at End | Notes |
|--------|----------------|-------------|-------------|--------------|-------|
| **Fast (default)** | 15.8 seconds | ~2x real-time | Minimal | Minimal | Direct mapping prevents drift |
| **Precise** | 5m 18s | ~40x real-time | Less drift | Minimal | Full frame extraction + multi-resolution |

**Key Performance Insights:**

- **Fast Engine**: Processes at approximately 2x real-time speed. With `max_keyframes=1` (default), it uses direct frame mapping which completely eliminates drift while maintaining fast performance.

- **Precise Engine**: While significantly slower (~40x real-time), it provides superior alignment accuracy, especially for complex videos. Interestingly, it shows less drift at the 1-second mark compared to the fast engine, though both engines perform well at video endpoints.

### Technical Optimizations

- **Drift Elimination**: The fast engine now defaults to `max_keyframes=1`, forcing direct frame-to-frame mapping that eliminates temporal drift entirely.
- **Optimized Compositing**: Sequential frame reading instead of random access yields a **10-100x speedup** in the final composition stage.
- **Parallel Processing**: Frame fingerprinting and cost matrix computation leverage all available CPU cores.
- **Perceptual Hashing**: Frame comparison is **100-1000x faster** than pixel-wise methods while maintaining accuracy.
- **Memory Efficiency**: The fast engine uses streaming processing, while the precise engine trades memory for accuracy by loading all frames.

### Choosing the Right Engine

**Use the Fast Engine (default) when:**
- You need quick results (2x real-time processing)
- The videos are already reasonably synchronized
- Minor imperfections are acceptable
- Processing many videos in batch

**Use the Precise Engine when:**
- Perfect synchronization is critical
- Videos have complex timing variations
- Content quality justifies longer processing time
- Working with professionally edited content

## Development

To contribute to `vidkompy`, set up a development environment using `hatch`.

### Setup

1.  Clone the repository.
2.  Ensure you have `hatch` installed (`pip install hatch`).
3.  The project is managed through `hatch` environments defined in `pyproject.toml`.

### Key Commands

Run these commands from the root of the repository.

- **Run Tests**:

```bash
hatch run test
```

- **Run Tests with Coverage Report**:

```bash
hatch run test-cov
```

- **Run Type Checking**:

```bash
hatch run type-check
```

- **Check Formatting and Linting**:

```bash
hatch run lint
```

- **Automatically Fix Formatting and Linting Issues**:

```bash
hatch run fix
```

## License

This project is licensed under the MIT License. See the [LICENSE](https://www.google.com/search?q=LICENSE) file for details.



START SPECIFICATION:
---
description: Create overview documentation for projects focused on video processing, synchronization, and alignment, particularly when dealing with foreground/background video composition and intelligent temporal matching
globs: *.py,*.md
alwaysApply: false
---


# main-overview

## Development Guidelines

- Only modify code directly relevant to the specific request. Avoid changing unrelated functionality.
- Never replace code with placeholders like `# ... rest of the processing ...`. Always include complete code.
- Break problems into smaller steps. Think through each step separately before implementing.
- Always provide a complete PLAN with REASONING based on evidence from code and logs before making changes.
- Explain your OBSERVATIONS clearly, then provide REASONING to identify the exact issue. Add console logs when needed to gather more information.


The vidkompy system implements intelligent video overlay and synchronization with three core business domains:

## Frame Analysis System (Importance: 95)
- Perceptual hashing engine combines multiple algorithms:
  - Frequency domain analysis (pHash)
  - Brightness patterns (AverageHash) 
  - Color distribution statistics (ColorMomentHash)
  - Edge detection (MarrHildrethHash)
- Weighted fingerprint generation for optimal frame matching
- Border region masking for edge-based alignment

## Temporal Alignment Engine (Importance: 98)
- Dynamic Time Warping (DTW) implementation with:
  - Sakoe-Chiba band constraints
  - Monotonic frame mapping
  - Keyframe interpolation
- Adaptive keyframe density calculation
- Foreground-first preservation principle
- Frame sequence validation

## Spatial Positioning System (Importance: 92)
- Template matching with normalized correlation
- Auto-scaling for resolution mismatches
- Border mode alignment options
- Position confidence scoring
- Center alignment fallback

## Video Composition Rules (Importance: 85)
- Foreground frame preservation guarantee
- Background frame warping/adaptation
- Audio stream selection logic
- Quality-based decision system
- Temporal drift prevention

Core Integration Points:
```
src/vidkompy/
  ├── core/
  │   ├── alignment_engine.py    # Orchestration logic
  │   ├── dtw_aligner.py        # Temporal matching
  │   ├── frame_fingerprint.py  # Frame analysis
  │   └── spatial_alignment.py  # Position detection
```

$END$
END SPECIFICATION
</file>

<file path="pyproject.toml">
[project]
name = 'vidkompy'
dynamic = ['version']
description = ''
readme = 'README.md'
requires-python = '>=3.10'
license = 'MIT'
keywords = []
classifiers = [
    'Development Status :: 4 - Beta',
    'Programming Language :: Python',
    'Programming Language :: Python :: 3.10',
    'Programming Language :: Python :: 3.11',
    'Programming Language :: Python :: 3.12',
    'Programming Language :: Python :: Implementation :: CPython',
    'Programming Language :: Python :: Implementation :: PyPy',
]
dependencies = [
    'fire',
    'rich',
    'loguru',
    'opencv-contrib-python',
    'numpy',
    'scipy',
    'ffmpeg-python',
    'soundfile',
    'scikit-image',
    'numba>=0.58.0',
]

[[project.authors]]
name = 'Adam Twardoch'
email = 'adam+github@twardoch.com'

[project.optional-dependencies]
dev = [
    'pre-commit>=4.2.0',
    'ruff>=0.1.0',
    'mypy>=1.0.0',
    'pyupgrade>=3.19.0',
]
test = [
    'pytest>=8.3.5',
    'pytest-cov>=6.1.1',
]
all = [
    'fire',
    'rich',
    'loguru',
    'opencv-contrib-python',
    'numpy',
    'scipy',
    'ffmpeg-python',
    'soundfile',
    'scikit-image',
    'pre-commit>=4.2.0',
    'ruff>=0.1.0',
    'mypy>=1.0.0',
    'pyupgrade>=3.19.0',
    'pytest>=8.3.5',
    'pytest-cov>=6.1.1',
    'hatchling>=1.21.0',
    'hatch-vcs>=0.3.0',
]

[project.scripts]

[project.urls]
Documentation = 'https://github.com/twardoch/vidkompy#readme'
Issues = 'https://github.com/twardoch/vidkompy/issues'
Source = 'https://github.com/twardoch/vidkompy'

[build-system]
build-backend = 'hatchling.build'
requires = [
    'hatchling>=1.21.0',
    'hatch-vcs>=0.3.0',
]
[tool.coverage.paths]
vidkompy = [
    'src/vidkompy',
    '*/vidkompy/src/vidkompy',
]
tests = [
    'tests',
    '*/vidkompy/tests',
]

[tool.coverage.report]
exclude_lines = [
    'no cov',
    'if __name__ == .__main__.:',
    'if TYPE_CHECKING:',
]

[tool.coverage.run]
source_pkgs = [
    'vidkompy',
    'tests',
]
branch = true
parallel = true
omit = ['src/vidkompy/__about__.py']
[tool.hatch.build.hooks.vcs]
version-file = 'src/vidkompy/__version__.py'
[tool.hatch.build.targets.wheel]
packages = ['src/vidkompy']
[tool.hatch.envs.default]
dependencies = []

[tool.hatch.envs.default.scripts]
test = 'pytest {args:tests}'
test-cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=src/vidkompy --cov=tests {args:tests}'
type-check = 'mypy src/vidkompy tests'
lint = [
    'ruff check src/vidkompy tests',
    'ruff format --respect-gitignore src/vidkompy tests',
]
fix = [
    'ruff check  --fix --unsafe-fixes src/vidkompy tests',
    'ruff format --respect-gitignore src/vidkompy tests',
]
[[tool.hatch.envs.all.matrix]]
python = [
    '3.10',
    '3.11',
    '3.12',
]

[tool.hatch.envs.lint]
detached = true
dependencies = []

[tool.hatch.envs.lint.scripts]
typing = 'mypy --install-types --non-interactive {args:src/vidkompy tests}'
style = [
    'ruff check {args:.}',
    'ruff format --respect-gitignore {args:.}',
]
fmt = [
    'ruff format --respect-gitignore {args:.}',
    'ruff check --fix {args:.}',
]
all = [
    'style',
    'typing',
]

[tool.hatch.envs.test]
dependencies = []

[tool.hatch.envs.test.scripts]
test = 'python -m pytest -n auto -p no:briefcase {args:tests}'
test-cov = 'python -m pytest -n auto -p no:briefcase --cov-report=term-missing --cov-config=pyproject.toml --cov=src/vidkompy --cov=tests {args:tests}'
bench = 'python -m pytest -v -p no:briefcase tests/test_benchmark.py --benchmark-only'
bench-save = 'python -m pytest -v -p no:briefcase tests/test_benchmark.py --benchmark-only --benchmark-json=benchmark/results.json'

[tool.hatch.version]
source = 'vcs'

[tool.hatch.version.raw-options]
version_scheme = 'post-release'

[tool.mypy]
python_version = '3.10'
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
disallow_untyped_decorators = true
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
warn_unreachable = true

[tool.ruff]
target-version = 'py310'
line-length = 88

[tool.ruff.lint]
extend-select = [
    'A',
    'ARG',
    'B',
    'C',
    'DTZ',
    'E',
    'EM',
    'F',
    'FBT',
    'I',
    'ICN',
    'ISC',
    'N',
    'PLC',
    'PLE',
    'PLR',
    'PLW',
    'Q',
    'RUF',
    'S',
    'T',
    'TID',
    'UP',
    'W',
    'YTT',
]
ignore = [
    'ARG001',
    'E501',
    'I001',
    'RUF001',
    'PLR2004',
    'EXE003',
    'ISC001',
]

[tool.ruff.per-file-ignores]
"tests/*" = ['S101']
[tool.pytest.ini_options]
addopts = '-v --durations=10 -p no:briefcase'
asyncio_mode = 'auto'
console_output_style = 'progress'
filterwarnings = [
    'ignore::DeprecationWarning',
    'ignore::UserWarning',
]
log_cli = true
log_cli_level = 'INFO'
markers = [
    '''benchmark: marks tests as benchmarks (select with '-m benchmark')''',
    'unit: mark a test as a unit test',
    'integration: mark a test as an integration test',
    'permutation: tests for permutation functionality',
    'parameter: tests for parameter parsing',
    'prompt: tests for prompt parsing',
]
norecursedirs = [
    '.*',
    'build',
    'dist',
    'venv',
    '__pycache__',
    '*.egg-info',
    '_private',
]
python_classes = ['Test*']
python_files = ['test_*.py']
python_functions = ['test_*']
testpaths = ['tests']

[tool.pytest-benchmark]
min_rounds = 100
min_time = 0.1
histogram = true
storage = 'file'
save-data = true
compare = [
    'min',
    'max',
    'mean',
    'stddev',
    'median',
    'iqr',
    'ops',
    'rounds',
]
</file>

<file path="src/vidkompy/core/dtw_aligner.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/dtw_aligner.py

"""
Dynamic Time Warping (DTW) for video frame alignment.

This module implements DTW algorithm for finding the globally optimal
monotonic alignment between two video sequences, preventing temporal artifacts.
"""

import numpy as np
from collections.abc import Callable
from loguru import logger
from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn
from rich.console import Console

from vidkompy.models import FrameAlignment
from vidkompy.core.numba_optimizations import (
    compute_dtw_cost_matrix_numba,
    find_dtw_path_numba,
    prepare_fingerprints_for_numba,
    log_numba_compilation,
)

console = Console()


class DTWAligner:
    """Dynamic Time Warping for video frame alignment.

    Why DTW over current greedy matching:
    - Guarantees monotonic alignment (no backward jumps)
    - Finds globally optimal path, not just local matches
    - Handles speed variations naturally
    - Proven algorithm from speech/time series analysis

    Why Sakoe-Chiba band constraint:
    - Reduces complexity from O(N²) to O(N×window)
    - Prevents extreme time warping
    - Makes algorithm practical for long videos
    """

    def __init__(self, window: int = 100):
        """Initialize DTW aligner with constraints.

        Args:
            window: Maximum deviation from diagonal path
                              (Sakoe-Chiba band width). Set to 0 to use default.
        """
        self.window = window
        self.default_window = 100
        self.use_numba = True  # Flag to enable/disable numba optimizations
        self._numba_compiled = False

    def set_window(self, window: int):
        """Set the window constraint for DTW alignment.

        Args:
            window: Window size for sliding frame matching. 0 means use default.
        """
        if window > 0:
            self.window = window
        else:
            self.window = self.default_window

    def align_videos(
        self,
        fg_fingerprints: dict[int, dict[str, np.ndarray]],
        bg_fingerprints: dict[int, dict[str, np.ndarray]],
        fingerprint_compare_func: Callable,
        show_progress: bool = True,
    ) -> list[tuple[int, int, float]]:
        """Find optimal monotonic alignment using DTW.

        Args:
            fg_fingerprints: Foreground video fingerprints {frame_idx: fingerprint}
            bg_fingerprints: Background video fingerprints {frame_idx: fingerprint}
            fingerprint_compare_func: Function to compare two fingerprints (0-1 similarity)
            show_progress: Whether to show progress bar

        Returns:
            List of (bg_idx, fg_idx, confidence) tuples representing optimal alignment

        Why this approach:
        - Works on pre-computed fingerprints for speed
        - Returns confidence scores for quality assessment
        - Maintains fg frame order (as required)
        """
        fg_indices = sorted(fg_fingerprints.keys())
        bg_indices = sorted(bg_fingerprints.keys())

        n_fg = len(fg_indices)
        n_bg = len(bg_indices)

        logger.info(
            f"Starting DTW alignment: {n_fg} fg frames × {n_bg} bg frames, "
            f"window={self.window}"
        )

        # Build DTW cost matrix
        dtw_matrix = self._build_dtw_matrix(
            fg_indices,
            bg_indices,
            fg_fingerprints,
            bg_fingerprints,
            fingerprint_compare_func,
            show_progress,
        )

        # Find optimal path
        path = self._find_optimal_path(dtw_matrix, n_fg, n_bg)

        # Convert path to frame alignments with confidence scores
        alignments = self._path_to_alignments(
            path,
            fg_indices,
            bg_indices,
            fg_fingerprints,
            bg_fingerprints,
            fingerprint_compare_func,
        )

        logger.info(f"DTW completed: {len(alignments)} frame alignments")

        return alignments

    def _build_dtw_matrix(
        self,
        fg_indices: list[int],
        bg_indices: list[int],
        fg_fingerprints: dict[int, dict[str, np.ndarray]],
        bg_fingerprints: dict[int, dict[str, np.ndarray]],
        compare_func: Callable,
        show_progress: bool,
    ) -> np.ndarray:
        """Build DTW cost matrix with Sakoe-Chiba band constraint.

        Why band constraint:
        - Prevents extreme time warping
        - Reduces computation from O(N²) to O(N×window)
        - Enforces reasonable temporal alignment
        """
        n_fg = len(fg_indices)
        n_bg = len(bg_indices)

        # Try to use Numba optimization if available
        if self.use_numba and n_fg > 10 and n_bg > 10:  # Only use for non-trivial sizes
            try:
                if not self._numba_compiled:
                    log_numba_compilation()
                    self._numba_compiled = True
                
                # Convert fingerprints to feature arrays for numba
                fg_features = self._fingerprints_to_features(fg_fingerprints, fg_indices)
                bg_features = self._fingerprints_to_features(bg_fingerprints, bg_indices)
                
                if show_progress:
                    console.print("  Using Numba-optimized DTW computation...")
                
                # Use numba-optimized version
                dtw = compute_dtw_cost_matrix_numba(fg_features, bg_features, self.window)
                return dtw
                
            except Exception as e:
                logger.warning(f"Failed to use Numba optimization: {e}")
                logger.info("Falling back to standard implementation")
        
        # Standard implementation
        # Initialize with infinity
        dtw = np.full((n_fg + 1, n_bg + 1), np.inf)
        dtw[0, 0] = 0

        # Progress tracking
        if show_progress:
            progress = Progress(
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TimeRemainingColumn(),
                console=console,
                transient=True,
            )
            task = progress.add_task("  Building DTW matrix...", total=n_fg)
            progress.start()

        # Fill DTW matrix with band constraint
        for i in range(1, n_fg + 1):
            # Sakoe-Chiba band: only compute within window of diagonal
            j_start = max(1, i - self.window)
            j_end = min(n_bg + 1, i + self.window)

            for j in range(j_start, j_end):
                # Get fingerprints
                fg_fp = fg_fingerprints[fg_indices[i - 1]]
                bg_fp = bg_fingerprints[bg_indices[j - 1]]

                # Compute cost (1 - similarity)
                similarity = compare_func(fg_fp, bg_fp)
                cost = 1.0 - similarity

                # DTW recursion: min of three possible paths
                dtw[i, j] = cost + min(
                    dtw[i - 1, j],  # Skip bg frame (insertion)
                    dtw[i, j - 1],  # Skip fg frame (deletion)
                    dtw[i - 1, j - 1],  # Match frames
                )

            if show_progress:
                progress.update(task, advance=1)

        if show_progress:
            progress.stop()

        return dtw

    def _find_optimal_path(
        self, dtw: np.ndarray, n_fg: int, n_bg: int
    ) -> list[tuple[int, int]]:
        """Backtrack through DTW matrix to find optimal path.

        Why backtracking:
        - Recovers the actual alignment from cost matrix
        - Guarantees monotonic path
        - Handles insertions/deletions/matches
        """
        # Try to use Numba optimization if available
        if self.use_numba and n_fg > 10 and n_bg > 10:
            try:
                # Use numba-optimized version
                path_array = find_dtw_path_numba(dtw)
                # Convert to list of tuples
                path = [(int(i), int(j)) for i, j in path_array]
                return path
            except Exception as e:
                logger.warning(f"Failed to use Numba path finding: {e}")
                logger.info("Falling back to standard implementation")
        
        # Standard implementation
        path = []
        i, j = n_fg, n_bg

        # Backtrack from end to start
        while i > 0 or j > 0:
            path.append((i, j))

            if i == 0:
                j -= 1
            elif j == 0:
                i -= 1
            else:
                # Choose direction with minimum cost
                costs = [
                    (i - 1, j, dtw[i - 1, j]),  # From above
                    (i, j - 1, dtw[i, j - 1]),  # From left
                    (i - 1, j - 1, dtw[i - 1, j - 1]),  # From diagonal
                ]

                # Filter out invalid positions
                valid_costs = [
                    (pi, pj, cost) for pi, pj, cost in costs if cost != np.inf
                ]

                if valid_costs:
                    i, j, _ = min(valid_costs, key=lambda x: x[2])
                else:
                    # Fallback: move diagonally
                    i, j = i - 1, j - 1

        # Reverse to get forward path
        path.reverse()

        # Remove dummy start position
        if path and path[0] == (0, 0):
            path = path[1:]

        return path

    def _path_to_alignments(
        self,
        path: list[tuple[int, int]],
        fg_indices: list[int],
        bg_indices: list[int],
        fg_fingerprints: dict[int, dict[str, np.ndarray]],
        bg_fingerprints: dict[int, dict[str, np.ndarray]],
        compare_func: Callable,
    ) -> list[tuple[int, int, float]]:
        """Convert DTW path to frame alignments with confidence scores.

        Why confidence scores:
        - Helps identify problematic alignments
        - Enables quality-based filtering
        - Provides feedback for debugging
        """
        alignments = []

        for i, j in path:
            # Skip boundary positions
            if i == 0 or j == 0:
                continue

            # Get actual frame indices
            fg_idx = fg_indices[i - 1]
            bg_idx = bg_indices[j - 1]

            # Compute confidence as similarity score
            fg_fp = fg_fingerprints[fg_idx]
            bg_fp = bg_fingerprints[bg_idx]
            confidence = compare_func(fg_fp, bg_fp)

            alignments.append((bg_idx, fg_idx, confidence))

        # Remove duplicate fg frames (keep best match)
        unique_alignments = {}
        for bg_idx, fg_idx, conf in alignments:
            if fg_idx not in unique_alignments or conf > unique_alignments[fg_idx][1]:
                unique_alignments[fg_idx] = (bg_idx, conf)

        # Convert back to list format
        final_alignments = [
            (bg_idx, fg_idx, conf)
            for fg_idx, (bg_idx, conf) in sorted(unique_alignments.items())
        ]

        return final_alignments

    def create_frame_alignments(
        self,
        dtw_matches: list[tuple[int, int, float]],
        total_fg_frames: int,
        total_bg_frames: int,
    ) -> list[FrameAlignment]:
        """Create complete frame-to-frame alignment from DTW matches.

        Args:
            dtw_matches: List of (bg_idx, fg_idx, confidence) from DTW
            total_fg_frames: Total number of foreground frames
            total_bg_frames: Total number of background frames

        Returns:
            List of FrameAlignment objects for every fg frame

        Why interpolation:
        - DTW may skip some frames
        - Need alignment for EVERY fg frame
        - Smooth interpolation prevents jumps
        """
        if not dtw_matches:
            # Fallback to simple linear mapping
            return self._create_linear_alignment(total_fg_frames, total_bg_frames)

        # Sort by fg index
        dtw_matches.sort(key=lambda x: x[1])

        # Create alignment for every fg frame
        alignments = []

        for fg_idx in range(total_fg_frames):
            # Find surrounding DTW matches
            prev_match = None
            next_match = None

            for match in dtw_matches:
                if match[1] <= fg_idx:
                    prev_match = match
                elif match[1] > fg_idx and next_match is None:
                    next_match = match
                    break

            # Interpolate or extrapolate
            if prev_match is None and next_match is None:
                # No matches at all - use linear
                bg_idx = int(fg_idx * total_bg_frames / total_fg_frames)
                confidence = 0.5
            elif prev_match is None:
                # Before first match - extrapolate
                bg_idx = max(0, next_match[0] - (next_match[1] - fg_idx))
                confidence = next_match[2] * 0.8
            elif next_match is None:
                # After last match - extrapolate
                bg_idx = min(
                    total_bg_frames - 1, prev_match[0] + (fg_idx - prev_match[1])
                )
                confidence = prev_match[2] * 0.8
            # Between matches - interpolate
            elif prev_match[1] == next_match[1]:
                # Same fg frame
                bg_idx = prev_match[0]
                confidence = prev_match[2]
            else:
                # Linear interpolation
                ratio = (fg_idx - prev_match[1]) / (next_match[1] - prev_match[1])
                bg_idx = int(prev_match[0] + ratio * (next_match[0] - prev_match[0]))
                confidence = prev_match[2] * (1 - ratio) + next_match[2] * ratio

            # Ensure valid bg index
            bg_idx = max(0, min(bg_idx, total_bg_frames - 1))

            alignments.append(
                FrameAlignment(
                    fg_frame_idx=fg_idx,
                    bg_frame_idx=bg_idx,
                    similarity_score=confidence,
                )
            )

        return alignments

    def _create_linear_alignment(
        self, total_fg_frames: int, total_bg_frames: int
    ) -> list[FrameAlignment]:
        """Create simple linear frame mapping as fallback.

        Why we need fallback:
        - DTW might fail on very dissimilar videos
        - Better than no alignment at all
        - Maintains temporal order
        """
        ratio = total_bg_frames / total_fg_frames if total_fg_frames > 0 else 1.0

        alignments = []
        for fg_idx in range(total_fg_frames):
            bg_idx = min(int(fg_idx * ratio), total_bg_frames - 1)

            alignments.append(
                FrameAlignment(
                    fg_frame_idx=fg_idx,
                    bg_frame_idx=bg_idx,
                    similarity_score=0.5,  # Unknown confidence
                )
            )

        return alignments

    def _compute_cost_matrix(
        self, fg_features: np.ndarray, bg_features: np.ndarray
    ) -> np.ndarray:
        """Compute cost matrix for DTW from feature arrays.

        Args:
            fg_features: Foreground feature vectors (N, D)
            bg_features: Background feature vectors (M, D)

        Returns:
            Cost matrix (N, M)
        """
        n_fg = len(fg_features)
        n_bg = len(bg_features)

        # Initialize cost matrix
        cost_matrix = np.zeros((n_fg, n_bg))

        # Compute pairwise distances
        for i in range(n_fg):
            for j in range(n_bg):
                # Euclidean distance between feature vectors
                cost_matrix[i, j] = np.linalg.norm(fg_features[i] - bg_features[j])

        # Normalize costs to [0, 1]
        if cost_matrix.max() > 0:
            cost_matrix = cost_matrix / cost_matrix.max()

        return cost_matrix

    def _compute_path(self, cost_matrix: np.ndarray) -> list[tuple[int, int]]:
        """Compute optimal DTW path from cost matrix.

        Args:
            cost_matrix: Cost matrix (N, M)

        Returns:
            List of (i, j) indices representing the optimal path
        """
        n_fg, n_bg = cost_matrix.shape

        # Initialize DTW matrix
        dtw = np.full((n_fg + 1, n_bg + 1), np.inf)
        dtw[0, 0] = 0

        # Fill DTW matrix with window constraint
        for i in range(1, n_fg + 1):
            # Sakoe-Chiba band
            j_start = max(1, i - self.window)
            j_end = min(n_bg + 1, i + self.window)

            for j in range(j_start, j_end):
                cost = cost_matrix[i - 1, j - 1]

                # DTW recursion
                dtw[i, j] = cost + min(
                    dtw[i - 1, j],  # Insertion
                    dtw[i, j - 1],  # Deletion
                    dtw[i - 1, j - 1],  # Match
                )

        # Backtrack to find path
        path = []
        i, j = n_fg, n_bg

        while i > 0 and j > 0:
            path.append((i - 1, j - 1))  # Convert to 0-based indices

            # Choose direction with minimum cost
            if i == 1:
                j -= 1
            elif j == 1:
                i -= 1
            else:
                costs = [
                    dtw[i - 1, j],  # From above
                    dtw[i, j - 1],  # From left
                    dtw[i - 1, j - 1],  # From diagonal
                ]

                min_idx = np.argmin(costs)
                if min_idx == 0:
                    i -= 1
                elif min_idx == 1:
                    j -= 1
                else:
                    i -= 1
                    j -= 1

        # Add remaining path to origin
        while i > 0:
            i -= 1
            path.append((i, 0))
        while j > 0:
            j -= 1
            path.append((0, j))

        path.reverse()

        # Remove any invalid entries
        path = [(i, j) for i, j in path if i >= 0 and j >= 0]

        return path

    def _fingerprints_to_features(
        self, fingerprints: dict[int, dict[str, np.ndarray]], indices: list[int]
    ) -> np.ndarray:
        """Convert fingerprints to feature matrix for Numba processing.
        
        Args:
            fingerprints: Dictionary of fingerprints
            indices: List of frame indices
            
        Returns:
            Feature matrix where each row is a flattened fingerprint
        """
        # Get sample fingerprint to determine feature size
        sample_fp = next(iter(fingerprints.values()))
        
        # Calculate total feature size
        feature_size = 0
        for key, value in sample_fp.items():
            if key == "histogram":
                feature_size += len(value)
            else:
                # Hash values
                feature_size += value.size
        
        # Build feature matrix
        features = np.zeros((len(indices), feature_size), dtype=np.float32)
        
        for i, idx in enumerate(indices):
            fp = fingerprints[idx]
            offset = 0
            
            # Add hash features
            for key in sorted(fp.keys()):
                if key == "histogram":
                    continue
                value = fp[key].flatten()
                features[i, offset:offset + len(value)] = value.astype(np.float32)
                offset += len(value)
            
            # Add histogram at the end
            if "histogram" in fp:
                hist = fp["histogram"]
                features[i, offset:offset + len(hist)] = hist
        
        return features
</file>

<file path="src/vidkompy/core/video_processor.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/video_processor.py

"""
Core video processing functionality.

Handles video I/O, metadata extraction, and frame operations.
"""

import cv2
import ffmpeg
import numpy as np
from pathlib import Path
from loguru import logger
from rich.progress import (
    Progress,
    TextColumn,
    BarColumn,
    TimeRemainingColumn,
)
from rich.console import Console

from vidkompy.models import VideoInfo

console = Console()


class VideoProcessor:
    """Handles core video processing operations.

    This module provides the foundation for all video I/O operations.
    It abstracts away the complexity of video codecs and formats.

    Why separate video processing:
    - Isolates platform-specific code
    - Makes testing easier (can mock video I/O)
    - Single place for optimization
    - Handles codec compatibility issues

    Why both OpenCV and FFmpeg:
    - OpenCV: Frame-accurate reading, computer vision operations
    - FFmpeg: Audio handling, codec support, fast encoding
    """

    def get_video_info(self, video_path: str) -> VideoInfo:
        """Extract video metadata using ffprobe.

        Why ffprobe instead of OpenCV:
        - More reliable metadata extraction
        - Handles all video formats
        - Provides accurate duration/framerate
        - Detects audio streams properly

        Why we need this info:
        - FPS determines temporal alignment strategy
        - Resolution needed for spatial alignment
        - Duration for progress estimation
        - Audio presence for alignment method selection

        Args:
            video_path: Path to video file

        Returns:
            VideoInfo object with metadata

        Raises:
            ValueError: If video cannot be probed
        """
        logger.debug(f"Probing video: {video_path}")

        try:
            probe = ffmpeg.probe(video_path)

            # Find video stream
            video_stream = next(
                (s for s in probe["streams"] if s["codec_type"] == "video"), None
            )

            if not video_stream:
                msg = f"No video stream found in {video_path}"
                raise ValueError(msg)

            # Extract properties
            width = int(video_stream["width"])
            height = int(video_stream["height"])

            # Parse frame rate
            fps_str = video_stream.get("r_frame_rate", "0/1")
            if "/" in fps_str:
                num, den = map(int, fps_str.split("/"))
                fps = num / den if den != 0 else 0
            else:
                fps = float(fps_str)

            duration = float(probe["format"].get("duration", 0))

            # Calculate frame count
            frame_count = int(video_stream.get("nb_frames", 0))
            if frame_count == 0 and duration > 0 and fps > 0:
                frame_count = int(duration * fps)

            # Check audio
            audio_stream = next(
                (s for s in probe["streams"] if s["codec_type"] == "audio"), None
            )

            has_audio = audio_stream is not None
            audio_sample_rate = None
            audio_channels = None

            if audio_stream:
                audio_sample_rate = int(audio_stream.get("sample_rate", 0))
                audio_channels = int(audio_stream.get("channels", 0))

            info = VideoInfo(
                width=width,
                height=height,
                fps=fps,
                duration=duration,
                frame_count=frame_count,
                has_audio=has_audio,
                audio_sample_rate=audio_sample_rate,
                audio_channels=audio_channels,
                path=video_path,
            )

            logger.info(
                f"Video info for {Path(video_path).name}: "
                f"{width}x{height}, {fps:.2f} fps, {duration:.2f}s, "
                f"{frame_count} frames, audio: {'yes' if has_audio else 'no'}"
            )

            return info

        except Exception as e:
            logger.error(f"Failed to probe video {video_path}: {e}")
            raise

    def extract_frames(
        self, video_path: str, frame_indices: list[int], resize_factor: float = 1.0
    ) -> list[np.ndarray]:
        """Extract specific frames from video.

        Why selective frame extraction:
        - Loading full video would exhaust memory
        - We only need specific frames for matching
        - Random access is fast with modern codecs

        Why resize option:
        - Faster processing on smaller frames
        - SSIM works fine at lower resolution
        - Reduces memory usage significantly

        Why progress bar for large extractions:
        - Frame seeking can be slow on some codecs
        - Users need feedback for long operations
        - Helps identify performance issues

        Args:
            video_path: Path to video file
            frame_indices: List of frame indices to extract
            resize_factor: Factor to resize frames (for performance)

        Returns:
            List of frames as numpy arrays
        """
        frames = []
        cap = cv2.VideoCapture(video_path)

        if not cap.isOpened():
            logger.error(f"Failed to open video: {video_path}")
            return frames

        try:
            # Only show progress for large frame extractions
            if len(frame_indices) > 50:
                with Progress(
                    TextColumn("[progress.description]{task.description}"),
                    BarColumn(),
                    TimeRemainingColumn(),
                    console=console,
                    transient=True,
                ) as progress:
                    task = progress.add_task(
                        f"    Extracting {len(frame_indices)} frames...",
                        total=len(frame_indices),
                    )

                    for idx in frame_indices:
                        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                        ret, frame = cap.read()

                        if ret:
                            if resize_factor != 1.0:
                                height, width = frame.shape[:2]
                                new_width = int(width * resize_factor)
                                new_height = int(height * resize_factor)
                                frame = cv2.resize(frame, (new_width, new_height))
                            frames.append(frame)
                        else:
                            logger.warning(
                                f"Failed to read frame {idx} from {video_path}"
                            )

                        progress.update(task, advance=1)
            else:
                # No progress bar for small extractions
                for idx in frame_indices:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                    ret, frame = cap.read()

                    if ret:
                        if resize_factor != 1.0:
                            height, width = frame.shape[:2]
                            new_width = int(width * resize_factor)
                            new_height = int(height * resize_factor)
                            frame = cv2.resize(frame, (new_width, new_height))
                        frames.append(frame)
                    else:
                        logger.warning(f"Failed to read frame {idx} from {video_path}")

        finally:
            cap.release()

        return frames

    def extract_frame_range(
        self,
        video_path: str,
        start_frame: int,
        end_frame: int,
        step: int = 1,
        resize_factor: float = 1.0,
    ) -> list[tuple[int, np.ndarray]]:
        """Extract a range of frames with their indices.

        Args:
            video_path: Path to video
            start_frame: Starting frame index
            end_frame: Ending frame index (exclusive)
            step: Frame step size
            resize_factor: Resize factor for frames

        Returns:
            List of (frame_index, frame) tuples
        """
        frames = []
        cap = cv2.VideoCapture(video_path)

        if not cap.isOpened():
            logger.error(f"Failed to open video: {video_path}")
            return frames

        try:
            for idx in range(start_frame, end_frame, step):
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()

                if ret:
                    if resize_factor != 1.0:
                        height, width = frame.shape[:2]
                        new_width = int(width * resize_factor)
                        new_height = int(height * resize_factor)
                        frame = cv2.resize(frame, (new_width, new_height))
                    frames.append((idx, frame))
                else:
                    break

        finally:
            cap.release()

        return frames

    def create_video_writer(
        self, output_path: str, width: int, height: int, fps: float, codec: str = "mp4v"
    ) -> cv2.VideoWriter:
        """Create OpenCV video writer.

        Why H.264/mp4v codec:
        - Universal compatibility
        - Good compression ratio
        - Hardware acceleration available
        - Supports high resolutions

        Why we write silent video first:
        - OpenCV VideoWriter doesn't handle audio
        - Gives us perfect frame control
        - Audio added later with FFmpeg

        Args:
            output_path: Output video path
            width: Video width
            height: Video height
            fps: Frame rate
            codec: Video codec (default mp4v)

        Returns:
            VideoWriter object
        """
        fourcc = cv2.VideoWriter_fourcc(*codec)
        writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        if not writer.isOpened():
            msg = f"Failed to create video writer for {output_path}"
            raise ValueError(msg)

        return writer
    
    def extract_all_frames(
        self, video_path: str, resize_factor: float = 1.0, crop: tuple[int, int, int, int] | None = None
    ) -> np.ndarray | None:
        """Extract all frames from video as a numpy array.
        
        This is used by the precise alignment engine which needs
        access to all frames for multi-resolution processing.
        
        Args:
            video_path: Path to video file
            resize_factor: Factor to resize frames (for performance)
            crop: Optional (x, y, width, height) tuple to crop frames
            
        Returns:
            Array of frames or None if extraction fails
        """
        cap = cv2.VideoCapture(video_path)
        
        if not cap.isOpened():
            logger.error(f"Failed to open video: {video_path}")
            return None
        
        try:
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            
            # Calculate resized dimensions
            if resize_factor != 1.0:
                new_width = int(width * resize_factor)
                new_height = int(height * resize_factor)
            else:
                new_width, new_height = width, height
            
            # Apply crop dimensions if specified
            if crop:
                crop_x, crop_y, crop_w, crop_h = crop
                # Scale crop coordinates by resize factor
                crop_x = int(crop_x * resize_factor)
                crop_y = int(crop_y * resize_factor)
                crop_w = int(crop_w * resize_factor)
                crop_h = int(crop_h * resize_factor)
                
                # Ensure crop is within bounds
                crop_x = max(0, min(crop_x, new_width - 1))
                crop_y = max(0, min(crop_y, new_height - 1))
                crop_w = min(crop_w, new_width - crop_x)
                crop_h = min(crop_h, new_height - crop_y)
                
                final_width = crop_w
                final_height = crop_h
            else:
                final_width = new_width
                final_height = new_height
            
            # Pre-allocate array for efficiency
            frames = np.zeros((frame_count, final_height, final_width, 3), dtype=np.uint8)
            
            if crop:
                logger.info(f"Extracting all {frame_count} frames at {new_width}x{new_height}, cropped to {final_width}x{final_height}")
            else:
                logger.info(f"Extracting all {frame_count} frames at {new_width}x{new_height}")
            
            # Extract frames with progress bar
            with Progress(
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TimeRemainingColumn(),
                console=console,
                transient=True,
            ) as progress:
                task = progress.add_task(
                    f"    Extracting {frame_count} frames...",
                    total=frame_count,
                )
                
                frame_idx = 0
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break
                    
                    if resize_factor != 1.0:
                        frame = cv2.resize(frame, (new_width, new_height))
                    
                    # Apply cropping if specified
                    if crop:
                        frame = frame[crop_y:crop_y+crop_h, crop_x:crop_x+crop_w]
                    
                    if frame_idx < frame_count:
                        frames[frame_idx] = frame
                        frame_idx += 1
                        progress.update(task, advance=1)
                    else:
                        logger.warning(f"More frames than expected in {video_path}")
                        break
            
            # Trim array if we got fewer frames than expected
            if frame_idx < frame_count:
                logger.warning(f"Got {frame_idx} frames, expected {frame_count}")
                frames = frames[:frame_idx]
            
            logger.info(f"Extracted {len(frames)} frames from {Path(video_path).name}")
            return frames
            
        except Exception as e:
            logger.error(f"Failed to extract frames from {video_path}: {e}")
            return None
        finally:
            cap.release()
</file>

<file path="SPEC.md">
# Implementation Plan: Tunnel-Based Temporal Alignment

## Overview

The user has proposed a new approach to temporal alignment that avoids perceptual hashing/fingerprinting and instead uses direct frame comparison with sliding windows. This approach will be implemented as two new temporal alignment engines:

1. **tunnel_full**: Uses full frame comparison
2. **tunnel_mask**: Uses masked frame comparison (focusing on the overlapping region)

## Key Concept

Instead of pre-computing fingerprints for all frames, this approach:
- Iterates through each foreground (FG) frame sequentially
- Uses a sliding window in the background (BG) video to find the best match
- Performs matching from both ends (start and end) of the video
- Enforces monotonicity by constraining the search window to frames after the previous match

## Implementation Details

### 1. Tunnel Full Engine (`tunnel_full`)

#### Algorithm:
1. **Forward Pass (from start)**:
   - Start with FG frame 0
   - Search within a window of BG frames (e.g., frames 0-30) for the best match
   - Best match = minimum pixel difference between FG frame and BG frame (cropped to FG region)
   - Once found, move to FG frame 1
   - Constrain BG search window to start from the matched BG frame index + 1
   - Continue until all FG frames are matched

2. **Backward Pass (from end)**:
   - Start with last FG frame
   - Search within a window of BG frames near the end for the best match
   - Once found, move to previous FG frame
   - Constrain BG search window to end before the matched BG frame index - 1
   - Continue backward until all FG frames are matched

3. **Merge Strategy**:
   - Combine forward and backward mappings
   - Use confidence scores based on match quality
   - Apply smoothing to eliminate any remaining discontinuities

#### Frame Comparison Method:
```python
def compute_frame_difference(fg_frame, bg_frame, x_offset, y_offset):
    # Crop BG frame to FG region
    bg_cropped = bg_frame[y_offset:y_offset+fg_height, x_offset:x_offset+fg_width]
    
    # Compute pixel-wise difference
    diff = np.abs(fg_frame.astype(float) - bg_cropped.astype(float))
    
    # Return mean absolute difference
    return np.mean(diff)
```

### 2. Tunnel Mask Engine (`tunnel_mask`)

Similar to `tunnel_full` but with an additional masking step:

#### Masking Strategy:
1. Create a binary mask identifying the actual content region in FG frames (non-black areas)
2. Apply this mask during frame comparison to focus only on content regions
3. This helps when FG video has letterboxing or pillarboxing

#### Modified Frame Comparison:
```python
def compute_masked_frame_difference(fg_frame, bg_frame, x_offset, y_offset, mask):
    # Crop BG frame to FG region
    bg_cropped = bg_frame[y_offset:y_offset+fg_height, x_offset:x_offset+fg_width]
    
    # Apply mask to both frames
    fg_masked = fg_frame * mask
    bg_masked = bg_cropped * mask
    
    # Compute pixel-wise difference only in masked region
    diff = np.abs(fg_masked.astype(float) - bg_masked.astype(float))
    
    # Return mean absolute difference normalized by mask area
    return np.sum(diff) / np.sum(mask)
```

## Implementation Steps

### Step 1: Create Base Tunnel Aligner Class
Create `src/vidkompy/core/tunnel_aligner.py`:
- Base class with common functionality
- Window management logic
- Monotonicity enforcement
- Forward/backward pass framework

### Step 2: Implement TunnelFullAligner
Create subclass in same file:
- Implement `compute_frame_difference` method
- Handle full frame comparison logic
- Optimize with early stopping when good match found

### Step 3: Implement TunnelMaskAligner
Create subclass in same file:
- Implement mask generation logic
- Implement `compute_masked_frame_difference` method
- Handle edge cases where mask might be empty

### Step 4: Integration
Update `src/vidkompy/core/alignment_engine.py`:
- Add `tunnel_full` and `tunnel_mask` to engine registry
- Create appropriate configuration classes
- Wire up CLI parameters

### Step 5: Optimization Considerations
1. **Frame Reading**: Use sequential reading where possible
2. **Downsampling**: Option to downsample frames for faster comparison
3. **Early Stopping**: Stop searching when difference is below threshold
4. **Parallel Processing**: Process forward and backward passes in parallel
5. **Caching**: Cache recently read frames to avoid re-reading

## Configuration Parameters

### Common Parameters:
- `window_size`: Size of the sliding window (default: 30 frames)
- `downsample_factor`: Factor to downsample frames (default: 1.0 = no downsampling)
- `early_stop_threshold`: Stop searching if difference below this (default: 0.05)
- `merge_strategy`: How to combine forward/backward passes ("average", "confidence_weighted")

### Tunnel Mask Specific:
- `mask_threshold`: Threshold for creating binary mask (default: 10/255)
- `mask_erosion`: Pixels to erode mask to avoid edge artifacts (default: 2)

## Expected Benefits

1. **Direct Comparison**: No information loss from fingerprinting
2. **Adaptive**: Can find matches even with compression artifacts
3. **Monotonic by Design**: The sliding window constraint ensures monotonicity
4. **Bidirectional**: Matching from both ends improves robustness
5. **Content-Aware**: Mask mode focuses on actual content, ignoring borders

## Potential Challenges

1. **Performance**: Direct pixel comparison is slower than fingerprint matching
2. **Memory**: May need to keep more frames in memory
3. **Noise Sensitivity**: Pixel-wise comparison sensitive to compression/noise
4. **Window Size**: Too small = might miss correct match; too large = slow

## Mitigation Strategies

1. Use downsampling for initial coarse matching, then refine
2. Implement efficient frame caching
3. Add noise-robust comparison metrics (e.g., SSIM as alternative)
4. Make window size adaptive based on initial probe
</file>

<file path="src/vidkompy/core/alignment_engine.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/alignment_engine.py

"""
Main alignment engine that coordinates spatial and temporal alignment.

This is the high-level orchestrator that manages the complete video
overlay process.
"""

import tempfile
import cv2
import ffmpeg
import numpy as np
from pathlib import Path
from loguru import logger
from rich.progress import (
    Progress,
    BarColumn,
    TextColumn,
    TimeRemainingColumn,
)
from rich.console import Console

from vidkompy.models import (
    VideoInfo,
    MatchTimeMode,
    TemporalMethod,
    SpatialAlignment,
    TemporalAlignment,
    FrameAlignment,
)
from .video_processor import VideoProcessor
from .spatial_alignment import SpatialAligner
from .temporal_alignment import TemporalAligner


console = Console()


class AlignmentEngine:
    """Orchestrates the complete video alignment and overlay process.

    This is the main coordinator that manages the entire video overlay workflow.
    It handles the high-level process flow while delegating specific tasks to
    specialized components.

    Why this architecture:
    - Separation of concerns: Each component (spatial, temporal, processing) has a single responsibility
    - Flexibility: Easy to swap alignment algorithms or add new methods
    - Testability: Each component can be tested independently
    - Progress tracking: Centralized progress reporting for better UX
    """

    def __init__(
        self,
        processor: VideoProcessor,
        verbose: bool = False,
        max_keyframes: int = 2000,
        engine_mode: str = "fast",
        drift_interval: int = 100,
        window: int = 100,
    ):
        """Initialize alignment engine.

        Args:
            processor: Video processor instance
            verbose: Enable verbose logging
            max_keyframes: Maximum keyframes for frame matching
            engine_mode: The chosen engine ('fast', 'precise', 'mask')
            drift_interval: Frame interval for drift correction in precise engine
            window: DTW window size
        """
        self.processor = processor
        self.spatial_aligner = SpatialAligner()
        self.temporal_aligner = TemporalAligner(
            processor=processor,
            max_keyframes=max_keyframes,
            drift_interval=drift_interval,
            window=window,
            engine_mode=engine_mode,
        )
        self.verbose = verbose
        self.engine_mode = engine_mode

    def process(
        self,
        bg_path: str,
        fg_path: str,
        output_path: str,
        time_mode: MatchTimeMode,
        space_method: str,
        temporal_method: TemporalMethod,
        skip_spatial: bool,
        trim: bool,
        border_thickness: int = 8,
        blend: bool = False,
        window: int = 0,
    ):
        """Process video overlay with alignment.

        Args:
            bg_path: Background video path
            fg_path: Foreground video path
            output_path: Output video path
            time_mode: Temporal alignment mode
            space_method: Spatial alignment method
            temporal_method: Temporal algorithm to use (DTW or classic)
            skip_spatial: Skip spatial alignment
            trim: Trim to overlapping segment
            border_thickness: Border thickness for border matching mode
            blend: Enable smooth blending at frame edges
            window: Sliding window size for frame matching
        """
        # Analyze videos - quick task, use simple logging
        logger.info("Analyzing videos...")
        bg_info = self.processor.get_video_info(bg_path)
        fg_info = self.processor.get_video_info(fg_path)

        # Log compatibility
        self._log_compatibility(bg_info, fg_info)

        # Spatial alignment - quick task, use simple logging
        logger.info("Computing spatial alignment...")
        spatial_alignment = self._compute_spatial_alignment(
            bg_info, fg_info, space_method, skip_spatial
        )

        # Log spatial alignment results in non-verbose mode too
        if not skip_spatial:
            logger.info(
                f"Spatial alignment result: offset=({spatial_alignment.x_offset}, {spatial_alignment.y_offset}), "
                f"scale={spatial_alignment.scale_factor:.3f}, confidence={spatial_alignment.confidence:.3f}"
            )

        # Temporal alignment - potentially time-intensive, use progress tracking
        logger.info("Computing temporal alignment...")
        temporal_alignment = self._compute_temporal_alignment(
            bg_info,
            fg_info,
            time_mode,
            temporal_method,
            trim,
            spatial_alignment,
            border_thickness,
            window,
        )

        # Log temporal alignment results
        logger.info(
            f"Temporal alignment result: method={temporal_alignment.method_used}, "
            f"offset={temporal_alignment.offset_seconds:.3f}s, "
            f"frames={len(temporal_alignment.frame_alignments)}, "
            f"confidence={temporal_alignment.confidence:.3f}"
        )

        # Compose final video - time-intensive, use progress tracking
        logger.info("Composing output video...")
        self._compose_video(
            bg_info,
            fg_info,
            output_path,
            spatial_alignment,
            temporal_alignment,
            trim,
            blend,
            border_thickness,
        )

        logger.info(f"✅ Processing complete: {output_path}")

    def _log_compatibility(self, bg_info: VideoInfo, fg_info: VideoInfo):
        """Log video compatibility information.

        Why we log compatibility:
        - Early warning of potential issues (e.g., fg larger than bg)
        - Helps users understand processing decisions
        - Aids in debugging alignment problems
        - Documents the input characteristics for reproducibility
        """
        logger.info("Video compatibility check:")
        logger.info(
            f"  Resolution: {bg_info.width}x{bg_info.height} vs {fg_info.width}x{fg_info.height}"
        )
        logger.info(f"  FPS: {bg_info.fps:.2f} vs {fg_info.fps:.2f}")
        logger.info(f"  Duration: {bg_info.duration:.2f}s vs {fg_info.duration:.2f}s")
        logger.info(
            f"  Audio: {'yes' if bg_info.has_audio else 'no'} vs {'yes' if fg_info.has_audio else 'no'}"
        )

        if fg_info.width > bg_info.width or fg_info.height > bg_info.height:
            logger.warning(
                "⚠️  Foreground is larger than background - will be scaled down"
            )

    def _compute_spatial_alignment(
        self, bg_info: VideoInfo, fg_info: VideoInfo, method: str, skip: bool
    ) -> SpatialAlignment:
        """Compute spatial alignment using sample frames.

        Why we use middle frames for alignment:
        - Middle frames typically have the main content fully visible
        - Avoids potential black frames or transitions at start/end
        - Single frame is usually sufficient for static camera shots
        - Fast computation while maintaining accuracy

        Why we support skipping:
        - Sometimes users know the alignment (e.g., already centered)
        - Useful for testing temporal alignment independently
        - Speeds up processing when spatial alignment isn't needed
        """
        if skip:
            logger.info("Skipping spatial alignment - centering foreground")
            x_offset = (bg_info.width - fg_info.width) // 2
            y_offset = (bg_info.height - fg_info.height) // 2
            return SpatialAlignment(x_offset, y_offset, 1.0, 1.0)

        # Extract sample frames for alignment
        bg_frames = self.processor.extract_frames(
            bg_info.path, [bg_info.frame_count // 2]
        )
        fg_frames = self.processor.extract_frames(
            fg_info.path, [fg_info.frame_count // 2]
        )

        if not bg_frames or not fg_frames:
            logger.error("Failed to extract frames for spatial alignment")
            x_offset = (bg_info.width - fg_info.width) // 2
            y_offset = (bg_info.height - fg_info.height) // 2
            return SpatialAlignment(x_offset, y_offset, 1.0, 0.0)

        return self.spatial_aligner.align(
            bg_frames[0], fg_frames[0], method, skip_alignment=skip
        )

    def _compute_temporal_alignment(
        self,
        bg_info: VideoInfo,
        fg_info: VideoInfo,
        mode: MatchTimeMode,
        temporal_method: TemporalMethod,
        trim: bool,
        spatial_alignment: SpatialAlignment,
        border_thickness: int,
        window: int,
    ) -> TemporalAlignment:
        """Compute temporal alignment based on mode.

        Why we have multiple modes:
        - BORDER: Border-based matching focusing on frame edges (default)
        - PRECISE: Frame-based matching for maximum accuracy

        Why we always use frame-based methods:
        - Provides precise visual synchronization
        - Works with all videos regardless of audio
        - Handles all edge cases reliably
        """
        # Configure temporal aligner based on method and window
        self.temporal_aligner.use_dtw = temporal_method == TemporalMethod.DTW
        if window > 0:
            self.temporal_aligner.dtw_aligner.set_window(window)

        if mode == MatchTimeMode.BORDER:
            # Use border-based alignment with mask
            logger.info(
                f"Using border-based temporal alignment (border thickness: {border_thickness}px)"
            )
            border_mask = self.temporal_aligner.create_border_mask(
                spatial_alignment, fg_info, bg_info, border_thickness
            )
            return self.temporal_aligner.align_frames_with_mask(
                bg_info, fg_info, trim, border_mask
            )

        elif mode == MatchTimeMode.PRECISE:
            # Always use frame-based alignment
            return self.temporal_aligner.align_frames(bg_info, fg_info, trim)

        else:
            # For any other mode, use frame-based alignment
            return self.temporal_aligner.align_frames(bg_info, fg_info, trim)

    def _compose_video(
        self,
        bg_info: VideoInfo,
        fg_info: VideoInfo,
        output_path: str,
        spatial: SpatialAlignment,
        temporal: TemporalAlignment,
        trim: bool,
        blend: bool = False,
        border_thickness: int = 8,
    ):
        """Compose the final output video.

        Why we use a two-step process (silent video + audio):
        - OpenCV doesn't handle audio, but provides frame-accurate control
        - FFmpeg handles audio well but can have frame accuracy issues
        - Combining both gives us the best of both worlds

        Why we prefer FG audio:
        - FG video is considered "better quality" per requirements
        - FG frames drive the output timing
        - Keeping FG audio maintains sync with FG visuals
        """
        logger.info(f"Composing video with {temporal.method_used} temporal alignment")

        # Use OpenCV for frame-accurate composition
        with tempfile.TemporaryDirectory() as tmpdir:
            # Create silent video first
            temp_video = Path(tmpdir) / "temp_silent.mp4"

            self._compose_with_opencv(
                bg_info,
                fg_info,
                str(temp_video),
                spatial,
                temporal.frame_alignments,
                blend,
                border_thickness,
            )

            # Add audio
            self._add_audio_track(
                str(temp_video), output_path, bg_info, fg_info, temporal
            )

    def _frame_generator(self, video_path: str):
        """Yields frames from a video file sequentially.

        This generator provides sequential frame access which is much faster
        than random seeking. It's designed to eliminate the costly seek operations
        that slow down compositing.

        Args:
            video_path: Path to video file

        Yields:
            tuple: (frame_index, frame_array) for each frame
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            msg = f"Cannot open video file: {video_path}"
            raise OSError(msg)

        frame_idx = 0
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                yield frame_idx, frame
                frame_idx += 1
        finally:
            cap.release()

    def _compose_with_opencv(
        self,
        bg_info: VideoInfo,
        fg_info: VideoInfo,
        output_path: str,
        spatial: SpatialAlignment,
        alignments: list[FrameAlignment],
        blend: bool = False,
        border_thickness: int = 8,
    ):
        """Compose video using sequential reads for maximum performance.

        This optimized version eliminates random seeking by reading both video
        files sequentially. This provides a 10-100x speedup compared to the
        previous random-access approach.

        How it works:
        - Create generators that read each video file sequentially
        - Advance each generator to the frame we need
        - Since alignments are typically in ascending order, we mostly move forward
        - Eliminates costly seek operations that were the main bottleneck
        """
        if not alignments:
            logger.warning("No frame alignments provided. Cannot compose video.")
            return

        writer = self.processor.create_video_writer(
            output_path,
            bg_info.width,
            bg_info.height,
            fg_info.fps,  # Use FG fps to preserve all FG frames
        )

        bg_gen = self._frame_generator(bg_info.path)
        fg_gen = self._frame_generator(fg_info.path)

        current_bg_frame = None
        current_fg_frame = None
        current_bg_idx = -1
        current_fg_idx = -1

        frames_written = 0
        total_frames = len(alignments)

        # Create blend mask if needed
        blend_mask = None
        if blend:
            blend_mask = self.temporal_aligner.create_blend_mask(
                spatial, fg_info, bg_info, border_thickness
            )
            logger.info(f"Using blend mode with {border_thickness}px gradient")

        try:
            # Use proper progress bar for video composition
            with Progress(
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TextColumn("({task.completed}/{task.total} frames)"),
                TimeRemainingColumn(),
                console=console,
                transient=False,
            ) as progress:
                task = progress.add_task("Composing frames", total=total_frames)

                for _i, alignment in enumerate(alignments):
                    needed_fg_idx = alignment.fg_frame_idx
                    needed_bg_idx = alignment.bg_frame_idx

                    # Advance foreground generator to the needed frame
                    while current_fg_idx < needed_fg_idx:
                        try:
                            current_fg_idx, current_fg_frame = next(fg_gen)
                        except StopIteration:
                            logger.error("Reached end of foreground video unexpectedly")
                            break

                    # Advance background generator to the needed frame
                    while current_bg_idx < needed_bg_idx:
                        try:
                            current_bg_idx, current_bg_frame = next(bg_gen)
                        except StopIteration:
                            logger.error("Reached end of background video unexpectedly")
                            break

                    if current_fg_frame is None or current_bg_frame is None:
                        logger.error("Frame generator did not yield a frame. Aborting.")
                        break

                    # We now have the correct pair of frames
                    composite = self._overlay_frames(
                        current_bg_frame, current_fg_frame, spatial, blend_mask
                    )
                    writer.write(composite)
                    frames_written += 1

                    # Update progress bar
                    progress.update(task, advance=1)

        except StopIteration:
            logger.warning("Reached end of a video stream unexpectedly.")
        finally:
            writer.release()
            logger.info(f"Wrote {frames_written} frames to {output_path}")

    def _overlay_frames(
        self,
        bg_frame: np.ndarray,
        fg_frame: np.ndarray,
        spatial: SpatialAlignment,
        blend_mask: np.ndarray | None = None,
    ) -> np.ndarray:
        """Overlay foreground on background with spatial alignment and optional blending."""
        composite = bg_frame.copy()

        # Apply scaling if needed
        if spatial.scale_factor != 1.0:
            new_w = int(fg_frame.shape[1] * spatial.scale_factor)
            new_h = int(fg_frame.shape[0] * spatial.scale_factor)
            fg_frame = cv2.resize(
                fg_frame, (new_w, new_h), interpolation=cv2.INTER_AREA
            )
            # Also scale blend mask if provided
            if blend_mask is not None:
                blend_mask = cv2.resize(
                    blend_mask, (new_w, new_h), interpolation=cv2.INTER_AREA
                )

        fg_h, fg_w = fg_frame.shape[:2]
        bg_h, bg_w = bg_frame.shape[:2]

        # Calculate ROI with bounds checking
        x_start = max(0, spatial.x_offset)
        y_start = max(0, spatial.y_offset)
        x_end = min(bg_w, spatial.x_offset + fg_w)
        y_end = min(bg_h, spatial.y_offset + fg_h)

        # Calculate foreground crop if needed
        fg_x_start = max(0, -spatial.x_offset)
        fg_y_start = max(0, -spatial.y_offset)
        fg_x_end = fg_x_start + (x_end - x_start)
        fg_y_end = fg_y_start + (y_end - y_start)

        # Overlay
        if x_end > x_start and y_end > y_start:
            fg_crop = fg_frame[fg_y_start:fg_y_end, fg_x_start:fg_x_end]
            bg_slice = composite[y_start:y_end, x_start:x_end]

            if blend_mask is not None:
                # Apply alpha blending using the blend mask
                mask_crop = blend_mask[fg_y_start:fg_y_end, fg_x_start:fg_x_end]

                # Ensure mask has same dimensions for broadcasting
                if len(fg_crop.shape) == 3:  # Color image
                    mask_crop = np.stack([mask_crop] * 3, axis=2)

                # Alpha blend: result = fg * alpha + bg * (1 - alpha)
                composite[y_start:y_end, x_start:x_end] = (
                    fg_crop.astype(np.float32) * mask_crop
                    + bg_slice.astype(np.float32) * (1.0 - mask_crop)
                ).astype(np.uint8)
            else:
                # Simple overlay (original behavior)
                composite[y_start:y_end, x_start:x_end] = fg_crop

        return composite

    def _add_audio_track(
        self,
        video_path: str,
        output_path: str,
        bg_info: VideoInfo,
        fg_info: VideoInfo,
        temporal: TemporalAlignment,
    ):
        """Add audio track to the composed video."""
        # Prefer foreground audio as it's "better quality"
        if fg_info.has_audio:
            audio_source = fg_info.path
            audio_offset = 0.0  # FG audio is already aligned
            logger.info("Using foreground audio track")
        elif bg_info.has_audio:
            audio_source = bg_info.path
            audio_offset = -temporal.offset_seconds  # Compensate for alignment
            logger.info("Using background audio track")
        else:
            # No audio, just copy video
            logger.info("No audio tracks available")
            Path(video_path).rename(output_path)
            return

        # Merge audio with ffmpeg
        try:
            input_video = ffmpeg.input(video_path)

            if audio_offset != 0:
                input_audio = ffmpeg.input(audio_source, itsoffset=audio_offset)
            else:
                input_audio = ffmpeg.input(audio_source)

            stream = ffmpeg.output(
                input_video["v"],
                input_audio["a"],
                output_path,
                c="copy",
                acodec="aac",
                shortest=None,
            )

            ffmpeg.run(stream, overwrite_output=True, capture_stderr=True)

        except ffmpeg.Error as e:
            logger.error(f"Audio merge failed: {e.stderr.decode()}")
            # Fallback - save without audio
            Path(video_path).rename(output_path)
            logger.warning("Saved video without audio")
</file>

<file path="README.md">
# `vidkompy`

**Intelligent Video Overlay and Synchronization**

`vidkompy` is a powerful command-line tool engineered to overlay a foreground video onto a background video with exceptional precision and automatic alignment. The system intelligently handles discrepancies in resolution, frame rate, duration, and audio, prioritizing content integrity and synchronization accuracy over raw processing speed.

The core philosophy of `vidkompy` is to treat the **foreground video as the definitive source of quality and timing**. All its frames are preserved without modification or re-timing. The background video is dynamically adapted—stretched, retimed, and selectively sampled—to synchronize perfectly with every frame of the foreground content, ensuring a seamless and coherent final output.

---

## Features

- **Automatic Spatial Alignment**: Intelligently detects the optimal x/y offset to position the foreground video within the background, even if they are cropped differently.
- **Advanced Temporal Synchronization**: Aligns videos with different start times, durations, and frame rates, eliminating temporal drift and ensuring content matches perfectly over time.
- **Foreground-First Principle**: Guarantees that every frame of the foreground video is included in the output, preserving its original timing and quality. The background video is adapted to match the foreground.
- **Drift-Free Alignment**: Utilizes Dynamic Time Warping (DTW) to create a globally optimal, monotonic alignment, preventing the common "drift-and-catchup" artifacts seen with simpler methods.
- **High-Performance Processing**: Leverages multi-core processing, perceptual hashing, and optimized video I/O to deliver results quickly.
- Frame fingerprinting is 100-1000x faster than traditional pixel-wise comparison.
- Sequential video composition is 10-100x faster than random-access methods.
- **Smart Audio Handling**: Automatically uses the foreground audio track if available, falling back to the background audio. The audio is correctly synchronized with the final video.
- **Flexible Operation Modes**: Supports specialized modes like `border` matching for aligning content based on visible background edges, and `smooth` blending for seamless visual integration.

## How It Works

The `vidkompy` pipeline is a multi-stage process designed for precision and accuracy:

1.  **Video Analysis**: The tool begins by probing both background (BG) and foreground (FG) videos using `ffprobe` to extract essential metadata: resolution, frames per second (FPS), duration, frame count, and audio stream information.

2.  **Spatial Alignment**: To determine _where_ to place the foreground on the background, `vidkompy` extracts a sample frame from the middle of each video (where content is most likely to be stable). It then calculates the optimal (x, y) offset.

3.  **Temporal Alignment**: This is the core of `vidkompy`. To determine _when_ to start the overlay and how to map frames over time, the tool generates "fingerprints" of frames from both videos and uses Dynamic Time Warping (DTW) to find the best alignment path. This ensures every foreground frame is matched to the most suitable background frame.

4.  **Video Composition**: Once the spatial and temporal alignments are known, `vidkompy` composes the final video. It reads both video streams sequentially (for maximum performance) and, for each foreground frame, fetches the corresponding background frame as determined by the alignment map. The foreground is then overlaid at the correct spatial position.

5.  **Audio Integration**: After the silent video is composed, `vidkompy` adds the appropriate audio track (preferring the foreground's audio) with the correct offset to ensure it's perfectly synchronized with the video content.

## The Algorithms

`vidkompy` employs several sophisticated algorithms to achieve its high-precision results.

### Frame Fingerprinting (Perceptual Hashing)

Instead of comparing the millions of pixels in a frame, `vidkompy` creates a tiny, unique "fingerprint" (a hash) for each frame. Comparing these small fingerprints is thousands of times faster and smart enough to ignore minor changes from video compression.

---

The `FrameFingerprinter` module is designed for ultra-fast and robust frame comparison. It uses perceptual hashing, which generates a compact representation of a frame's visual structure.

The process works as follows:

1.  **Standardization**: The input frame is resized to a small, standard size (e.g., 64x64 pixels) and converted to grayscale. This ensures consistency and focuses on structural information over color.
2.  **Multi-Algorithm Hashing**: To improve robustness, `vidkompy` computes several types of perceptual hashes for each frame, as different algorithms are sensitive to different visual features:
- `pHash` (Perceptual Hash): Analyzes the frequency domain (using DCT), making it robust to changes in brightness, contrast, and gamma correction.
- `AverageHash`: Computes a hash based on the average color of the frame.
- `ColorMomentHash`: Captures the color distribution statistics of the frame.
- `MarrHildrethHash`: Detects edges and shapes, making it sensitive to structural features.
3.  **Combined Fingerprint**: The results from these hashers, along with a color histogram, are combined into a single "fingerprint" dictionary for the frame.
4.  **Comparison**: To compare two frames, their fingerprints are compared. The similarity is calculated using a weighted average of the normalized Hamming distance between their hashes and the correlation between their histograms. The weights are tuned based on the reliability of each hash type for video content. This entire process is parallelized across multiple CPU cores for maximum speed.

### Spatial Alignment (Template Matching)

To find the correct position for the foreground video, the tool takes a screenshot from the middle of it and searches for that exact image within a screenshot from the background video.

---

Spatial alignment determines the `(x, y)` coordinates at which to overlay the foreground frame onto the background. `vidkompy` uses a highly accurate and efficient method based on template matching.

1.  **Frame Selection**: A single frame is extracted from the temporal midpoint of both the foreground and background videos. This is done to get a representative frame, avoiding potential opening/closing titles or black frames.
2.  **Grayscale Conversion**: The frames are converted to grayscale. This speeds up the matching process by 3x and makes the alignment more robust to minor color variations between the videos.
3.  **Template Matching**: The core of the alignment is `cv2.matchTemplate` using the `TM_CCOEFF_NORMED` method. This function effectively "slides" the smaller foreground frame image across the larger background frame image and calculates a normalized cross-correlation score at each position.
4.  **Locating the Best Match**: The position with the highest correlation score (from `cv2.minMaxLoc`) is considered the best match. This location `(x_offset, y_offset)` represents the top-left corner where the foreground should be placed. The confidence of this match is the correlation score itself, which typically approaches `1.0` for a perfect match.
5.  **Scaling**: The system checks if the foreground video is larger than the background. If so, it is scaled down to fit, and the scale factor is recorded.

### Temporal Alignment Engines

`vidkompy` offers five temporal alignment engines, each with different trade-offs between speed, accuracy, and approach:
- **Fast** (default): Quick processing with perceptual hashing
- **Precise**: Maximum accuracy with multi-resolution alignment
- **Mask**: Enhanced precise engine with explicit masking
- **Tunnel Full**: Direct pixel comparison with sliding windows
- **Tunnel Mask**: Pixel comparison focused on content regions

---

Temporal alignment is the most critical and complex part of `vidkompy`. The goal is to create a mapping `FrameAlignment(fg_frame_idx, bg_frame_idx)` for every single foreground frame. `vidkompy` provides five distinct engines for this task:

#### Fast Engine (Default)

The **Fast Engine** uses **Dynamic Time Warping (DTW)** with perceptual hashing for efficient alignment:

1.  **Frame Sampling & Fingerprinting**: The tool samples frames sparsely based on the `max_keyframes` parameter and computes their perceptual fingerprints using multiple hash algorithms (pHash, AverageHash, ColorMomentHash, MarrHildrethHash).
2.  **Cost Matrix Construction**: A cost matrix is built where `cost(i, j)` is the "distance" (i.e., `1.0 - similarity`) between the fingerprint of foreground frame `i` and background frame `j`.
3.  **DTW with Constraints**: The DTW algorithm finds the lowest-cost path through this matrix with:
   - **Monotonicity**: The path can only move forward in time, preventing temporal jumps
   - **Sakoe-Chiba Band**: Constrains the search to a window around the diagonal (reduces complexity from O(N²) to O(N×w))
4.  **Direct Mapping Mode**: With `max_keyframes=1` (default in fast mode), the engine forces direct frame mapping to eliminate drift entirely.
5.  **Interpolation**: For sparse sampling, the engine linearly interpolates between matched keyframes to create a complete alignment map.

**Characteristics:**
- Processing time: ~15 seconds for an 8-second video
- Minimal drift with direct mapping mode
- Suitable for most use cases

#### Precise Engine (Advanced)

The **Precise Engine** implements a sophisticated multi-resolution approach for maximum accuracy. **Recent enhancements include improved drift correction using polynomial models, adaptive blending, and Savitzky-Golay smoothing to address temporal inconsistencies.**

1.  **Multi-Resolution Hierarchical Alignment**:
   - Creates temporal pyramids at multiple resolutions (1/16, 1/8, 1/4, 1/2, full)
   - Performs coarse-to-fine alignment, starting at the lowest resolution
   - Each level refines the previous level's mapping
   - Applies drift correction (now enhanced) every 100 frames

2.  **Keyframe Detection and Anchoring**:
   - Automatically detects keyframes based on temporal changes using Gaussian filtering
   - Aligns keyframes between videos as anchor points
   - Forces alignment at keyframes to prevent long-range drift
   - Detects scene changes and content transitions

3.  **Bidirectional DTW**:
   - Runs DTW in both forward and backward directions
   - Averages the two alignment paths to reduce systematic bias
   - Provides more robust alignment for videos with varying content

4.  **Sliding Window Refinement**:
   - Refines alignment in 30-frame windows
   - Searches locally for optimal alignment adjustments
   - Applies Gaussian smoothing for smooth transitions
   - Ensures strict monotonicity throughout

5.  **Confidence-Based Weighting**:
   - Computes confidence scores for each alignment
   - Weights multiple alignment methods based on their confidence
   - Combines results for optimal accuracy

**Characteristics:**
- Processing time: ~5 minutes for an 8-second video (includes full frame extraction)
- Virtually eliminates all temporal drift
- Handles complex scenarios with varying frame rates and content changes
- Best for critical applications requiring perfect synchronization

#### Tunnel Full Engine (Direct Comparison)

The **Tunnel Full Engine** uses direct pixel-by-pixel frame comparison with a sliding window approach:

1. **Bidirectional Matching**:
   - **Forward Pass**: Starts from the first FG frame, searches for best match in BG within a sliding window
   - **Backward Pass**: Starts from the last FG frame, searches backward
   - Merges both passes for robust alignment

2. **Sliding Window Constraint**:
   - Enforces monotonicity by design - can only search forward from the last matched frame
   - Window size controls the maximum temporal displacement
   - Prevents temporal jumps and ensures smooth progression

3. **Direct Pixel Comparison**:
   - Compares actual pixel values between FG and BG frames
   - No information loss from hashing or fingerprinting
   - More sensitive to compression artifacts but potentially more accurate

**Characteristics:**
- Processing time: Varies with window size and video resolution
- Zero drift by design due to monotonic constraints
- Best for videos with minimal compression artifacts
- Suitable when perceptual hashing misses subtle details

#### Tunnel Mask Engine (Content-Focused Comparison)

The **Tunnel Mask Engine** extends the tunnel approach with intelligent masking:

1. **Content Mask Generation**:
   - Automatically detects content regions (non-black areas) in FG frames
   - Creates binary mask to focus comparison on actual content
   - Helps with letterboxed or pillarboxed videos

2. **Masked Comparison**:
   - Only compares pixels within the mask region
   - Ignores black borders and letterboxing
   - More robust for videos with varying aspect ratios

3. **Same Bidirectional Approach**:
   - Uses forward and backward passes like Tunnel Full
   - Applies mask during all comparisons
   - Maintains monotonicity constraints

**Characteristics:**
- Similar performance to Tunnel Full
- Better handling of videos with black borders
- More accurate for content with letterboxing
- Ideal for videos where content doesn't fill the entire frame

#### Engine Comparison

| Aspect | Fast | Precise | Mask | Tunnel Full | Tunnel Mask |
|--------|------|---------|------|-------------|-------------|
| **Algorithm** | DTW + hashing | Multi-res DTW | Multi-res + mask | Direct pixel | Masked pixel |
| **Speed** | ~2x real-time | ~40x real-time | ~40x real-time | ~10-20x real-time | ~10-20x real-time |
| **Drift** | Minimal | Minimal | Minimal | Zero (monotonic) | Zero (monotonic) |
| **Memory** | Low | High | High | Medium | Medium |
| **Best For** | Quick results | Complex videos | Cropped content | Clean sources | Letterboxed |

## Usage

### Prerequisites

You must have the **FFmpeg** binary installed on your system and accessible in your system's `PATH`. `vidkompy` depends on it for all video and audio processing tasks.

### Installation

The tool is a Python package. It is recommended to install it from the repository to get the latest version.

```bash
# Clone the repository
git clone https://github.com/twardoch/vidkompy.git
cd vidkompy

# Install using uv (or pip)
uv pip install .
```

### Command-Line Interface (CLI)

The tool is run from the command line, providing paths to the background and foreground videos.

**Basic Examples:**

```bash
# Fast engine with direct mapping (default, no drift)
python -m vidkompy --bg background.mp4 --fg foreground.mp4

# Precise engine for maximum accuracy (slower but perfect sync)
python -m vidkompy --bg background.mp4 --fg foreground.mp4 --engine precise

# Tunnel engines for direct pixel comparison (no drift)
python -m vidkompy --bg bg.mp4 --fg fg.mp4 --engine tunnel_full --window 60
python -m vidkompy --bg bg.mp4 --fg letterboxed.mp4 --engine tunnel_mask

# Custom output path
python -m vidkompy --bg bg.mp4 --fg fg.mp4 --output result.mp4
```

**CLI Help:**

```
INFO: Showing help with the command '__main__.py -- --help'.

NAME
    __main__.py - Overlay foreground video onto background video with intelligent alignment.

SYNOPSIS
    __main__.py BG FG <flags>

DESCRIPTION
    Overlay foreground video onto background video with intelligent alignment.

POSITIONAL ARGUMENTS
    BG
        Type: str | pathlib.Path
        Background video path
    FG
        Type: str | pathlib.Path
        Foreground video path

FLAGS
    -o, --output=OUTPUT
        Type: Optional[str | pathlib...
        Default: None
        Output video path (auto-generated if not provided)
    -e, --engine=ENGINE
        Type: str
        Default: 'fast'
        Temporal alignment engine - 'fast', 'precise', 'mask', 'tunnel_full', or 'tunnel_mask' (default: 'fast')
    -m, --margin=MARGIN
        Type: int
        Default: 8
        Border thickness for border matching mode (default: 8)
    -s, --smooth=SMOOTH
        Type: bool
        Default: False
        Enable smooth blending at frame edges
    -g, --gpu=GPU
        Type: bool
        Default: False
        Enable GPU acceleration (future feature)
    -v, --verbose=VERBOSE
        Type: bool
        Default: False
        Enable verbose logging

NOTES
    You can also use flags syntax for POSITIONAL ARGUMENTS
```

## Performance

Recent updates have significantly improved `vidkompy`'s performance and accuracy:

### Real-World Performance Comparison

Based on actual benchmarks with an 8-second test video (1920x1080 background, 1920x870 foreground, ~480 frames):

| Engine | Processing Time | Speed Ratio | Drift at 1s | Drift at End | Notes |
|--------|----------------|-------------|-------------|--------------|-------|
| **Fast (default)** | 15.8 seconds | ~2x real-time | Minimal | Minimal | Direct mapping prevents drift |
| **Precise** | 5m 18s | ~40x real-time | Less drift | Minimal | Full frame extraction + multi-resolution |

**Key Performance Insights:**

- **Fast Engine**: Processes at approximately 2x real-time speed. With `max_keyframes=1` (default), it uses direct frame mapping which completely eliminates drift while maintaining fast performance.

- **Precise Engine**: While significantly slower (~40x real-time), it provides superior alignment accuracy, especially for complex videos. Interestingly, it shows less drift at the 1-second mark compared to the fast engine, though both engines perform well at video endpoints.

### Technical Optimizations

- **Drift Elimination**: The fast engine now defaults to `max_keyframes=1`, forcing direct frame-to-frame mapping that eliminates temporal drift entirely.
- **Optimized Compositing**: Sequential frame reading instead of random access yields a **10-100x speedup** in the final composition stage.
- **Parallel Processing**: Frame fingerprinting and cost matrix computation leverage all available CPU cores.
- **Perceptual Hashing**: Frame comparison is **100-1000x faster** than pixel-wise methods while maintaining accuracy.
- **Memory Efficiency**: The fast engine uses streaming processing, while the precise engine trades memory for accuracy by loading all frames.

### Choosing the Right Engine

**Use the Fast Engine (default) when:**
- You need quick results (2x real-time processing)
- The videos are already reasonably synchronized
- Minor imperfections are acceptable
- Processing many videos in batch

**Use the Precise Engine when:**
- Perfect synchronization is critical
- Videos have complex timing variations
- Content quality justifies longer processing time
- Working with professionally edited content

## Development

To contribute to `vidkompy`, set up a development environment using `hatch`.

### Setup

1.  Clone the repository.
2.  Ensure you have `hatch` installed (`pip install hatch`).
3.  The project is managed through `hatch` environments defined in `pyproject.toml`.

### Key Commands

Run these commands from the root of the repository.

- **Run Tests**:

```bash
hatch run test
```

- **Run Tests with Coverage Report**:

```bash
hatch run test-cov
```

- **Run Type Checking**:

```bash
hatch run type-check
```

- **Check Formatting and Linting**:

```bash
hatch run lint
```

- **Automatically Fix Formatting and Linting Issues**:

```bash
hatch run fix
```

## License

This project is licensed under the MIT License. See the [LICENSE](https://www.google.com/search?q=LICENSE) file for details.
</file>

<file path="CHANGELOG.md">
# Changelog

All notable changes to vidkompy will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added

- **Tunnel-Based Temporal Alignment Engines**: Implemented two new alignment engines based on direct frame comparison
    - `tunnel_full`: Uses full frame pixel comparison with sliding window approach
    - `tunnel_mask`: Uses masked frame comparison focusing on content regions
    - Both engines perform bidirectional matching (forward and backward passes)
    - Monotonicity enforced by design through sliding window constraints
    - Configurable window size, downsampling, and early stopping thresholds
    - Designed to eliminate temporal drift through direct frame matching

### Added
- **Numba JIT Optimization**: Integrated Numba JIT compilation for performance-critical operations
    - Added `numba>=0.58.0` dependency to `pyproject.toml`
    - Created `numba_optimizations.py` module with optimized implementations
    - DTW cost matrix computation now 5-20x faster with parallelized distance calculations
    - Frame fingerprint batch comparisons now 3-10x faster with vectorized operations
    - Multi-resolution drift correction optimized with JIT-compiled polynomial fitting
    - Automatic fallback to standard implementation if Numba compilation fails
    - First-run compilation overhead mitigated by caching compiled functions

### Changed
- **DTW Algorithm Performance**: Optimized DTWAligner with Numba JIT compilation
    - `_build_dtw_matrix()` uses parallel computation for large alignments
    - `_find_optimal_path()` uses optimized backtracking algorithm
    - Added feature extraction method for converting fingerprints to numpy arrays
    - Intelligent switching between Numba and standard implementation based on problem size

- **Frame Fingerprinting Performance**: Enhanced FrameFingerprinter with batch operations
    - Added `compare_fingerprints_batch()` method for parallel fingerprint comparisons
    - Optimized histogram correlation computation with Numba
    - Weighted similarity calculation now JIT-compiled
    - Batch Hamming distance computation parallelized across multiple cores

- **Multi-Resolution Alignment**: Accelerated drift correction in precise engine
    - Polynomial drift correction now uses Numba-optimized implementation
    - Adaptive blend factor calculation accelerated
    - Monotonicity enforcement optimized with compiled loops

- **Precise Engine Enhancement**: Implemented Idea 1 from `SPEC.md` for the `precise` temporal alignment engine.
    - Updated `PreciseEngineConfig` in `multi_resolution_aligner.py` to include new parameters for enhanced drift correction (polynomial model, adaptive blend factor) and Savitzky-Golay smoothing.
    - Modified `MultiResolutionAligner.apply_drift_correction` to use polynomial regression as a baseline for drift correction and to incorporate an adaptive blend factor, offering more nuanced adjustments than simple linear interpolation.
    - Added a global Savitzky-Golay smoothing pass in `MultiResolutionAligner.align` after drift correction and before the final interpolation to full resolution. This aims to reduce high-frequency oscillations ("flag wave" effect) in the temporal mapping.
    - Improved internal logic in `apply_drift_correction` and `refine_alignment` for clarity and robustness, including better handling of segment boundaries and loop variables.
    - Added safety checks in `interpolate_full_mapping` and `align` to handle empty or very short mappings, preventing potential errors.

### Fixed

- **Parameter Mismatch**: Fixed DTWAligner constructor parameter name from 'window' to 'window' in precise alignment engines to resolve TypeError during precise alignment initialization
- **Temporal Drift Issue**: Fixed severe drift at 5-second mark by implementing spatial cropping during temporal alignment and adjusting drift correction parameters
- **Spatial Cropping**: Added background frame cropping to match foreground region during temporal alignment, eliminating false dissimilarities from non-overlapping areas
- **Drift Correction Tuning**: Increased drift correction interval from 32 to 100 frames and blend factor from 0.7 to 0.85 to prevent overcorrection

### Added

- **Multi-Engine Architecture**: Added --engine CLI parameter to support multiple temporal alignment engines
- **Precise Engine Implementation**: Implemented advanced multi-resolution temporal alignment engine with drift correction
- **Engine Validation**: Added proper validation and error messages for engine selection
- **Comprehensive Engine Documentation**: Updated README.md with detailed explanations of both fast and precise engines
- **Performance Benchmarks**: Added real-world performance comparison data showing 2x vs 40x real-time processing
- **Usage Examples**: Added multiple CLI examples demonstrating different engine configurations

### Major Refactoring: Code Simplification & Performance

### Removed

- **Audio Alignment**: Removed all audio-based temporal alignment functionality (align_audio, extract_audio)
- **Feature Matching**: Removed ORB feature-based spatial alignment method
- **FAST Mode**: Removed audio-first temporal alignment mode
- **CLI Complexity**: Simplified CLI to essential parameters only
- **Redundant Options**: Removed match_time, match_space, temporal_align, skip_spatial_align, trim, max_keyframes, and window parameters
- **Dependencies**: Removed soundfile and scipy.signal as no longer needed

### Changed

- **Fixed Configuration**: Always uses border mode with DTW and template matching
- **Simplified API**: Reduced CLI to just bg, fg, output, border, blend, gpu, and verbose
- **Single Path**: Each operation now has only one implementation path
- **Code Reduction**: Approximately 40% reduction in codebase size

### Implementation Status vs Original Specs

- **SPEC5**: Fully implemented all quick wins and drift elimination features
- **SPEC4**: Partially implemented - removed alternative methods but kept optimized versions instead of full rewrite with FAISS/phase correlation

### Performance Improvements from SPEC5 Implementation

### Fixed

- **Temporal Drift**: Implemented adaptive keyframe density to prevent drift based on FPS differences
- **Border Mode Performance**: Enabled DTW with masked perceptual hashing for fast border mode alignment
- **Critical Performance**: Fixed compositing bottleneck by implementing sequential reading with generators
- **UI Bug**: Resolved "Only one live display may be active at once" error from nested Progress contexts
- **Compositing Speed**: Eliminated costly random seeks in video files during frame composition
- **Progress UX**: Removed annoying spinner displays for quick operations

### Added

- **Masked Fingerprinting**: Added compute_masked_fingerprint() method for border-aware perceptual hashing
- **Adaptive Keyframes**: calculate_adaptive_keyframe_count() adjusts density based on video characteristics
- **Benchmark Suite**: Created benchmark.py for comprehensive performance testing
- **Border Mode + DTW**: DTW temporal alignment now works with masked regions
- Sequential frame generators for optimal video reading performance
- Detailed frame composition progress bar with percentage, frame count, and time remaining
- Spatial alignment results now logged in non-verbose mode for better visibility
- Temporal alignment results now logged in non-verbose mode showing method, offset, and frame count
- Comprehensive docstrings explaining the "why" behind design decisions in all core modules
- SPEC4.md: Detailed performance improvement plan with DTW algorithm and perceptual hashing
- SPEC5.md: Drift elimination and performance optimization specification
- Border-based temporal alignment mode with mask generation
- Smooth alpha blending for frame edges
- Sliding window constraint for frame matching optimization

### Changed

- **Default Keyframes**: Reduced from 2000 to 200 for better drift prevention
- **Parallel Processing**: Cost matrix building already uses ThreadPoolExecutor for parallel computation
- **Border Mode Logic**: DTW no longer falls back to classic alignment in border mode
- **Performance**: Compositing now uses forward-only sequential reads instead of random seeks (10-100x speedup)
- **Performance**: Significantly reduced keyframe sampling when using SSIM (e.g., in border mode fallback), drastically improving speed for that specific path
- **Progress UX**: Quick tasks (video analysis, spatial alignment) now use simple logging instead of spinners
- **Progress Bars**: Frame composition shows meaningful progress bar instead of percentage logging
- **Default Mode**: Border-based temporal alignment is now the default for improved accuracy
- Progress bars now show time remaining for better user experience
- Maintained useful progress bars for time-intensive operations (DTW, cost matrix building)

### Documentation

- Added detailed docstrings to alignment_engine.py explaining architecture decisions
- Added detailed docstrings to spatial_alignment.py explaining algorithm choices
- Added detailed docstrings to temporal_alignment.py explaining current limitations
- Added detailed docstrings to video_processor.py explaining tool choices
- Created SPEC4.md with comprehensive improvement plan addressing performance and quality issues
- Created SPEC5.md with drift elimination strategies and performance targets

### Technical Details

- Implemented \_precompute_masked_video_fingerprints() for border mode DTW support
- Added \_compute_masked_frame_hash() for efficient masked perceptual hashing
- Removed SpinnerColumn from inner progress bars to prevent conflicts
- Added TimeRemainingColumn for better progress estimation
- Made outer progress transient to reduce visual clutter
</file>

<file path="src/vidkompy/vidkompy.py">
#!/usr/bin/env -S uv run -s
# /// script
# dependencies = [
# "fire", "rich", "loguru", "opencv-python", "numpy", "scipy",
# "ffmpeg-python", "soundfile", "scikit-image"
# ]
# ///
# this_file: src/vidkompy/vidkompy.py

"""
Intelligent video overlay tool with automatic spatial and temporal alignment.

This tool overlays foreground videos onto background videos with smart alignment:
- Preserves all foreground frames without retiming
- Finds optimal background frames for each foreground frame
- Supports audio-based and frame-based temporal alignment
- Automatic spatial alignment with template/feature matching
"""

import sys
from loguru import logger
from pathlib import Path

from .core.video_processor import VideoProcessor
from .core.alignment_engine import AlignmentEngine
from .models import MatchTimeMode, TemporalMethod


def main(
    bg: str | Path,
    fg: str | Path,
    output: str | Path | None = None,
    engine: str = "fast",
    drift_interval: int = 100,
    margin: int = 8,
    smooth: bool = False,
    gpu: bool = False,  # Future GPU acceleration support
    window: int = 0,
    verbose: bool = False,
):
    """Overlay foreground video onto background video with intelligent alignment.

    Args:
        bg: Background video path
        fg: Foreground video path
        output: Output video path (auto-generated if not provided)
        engine: Temporal alignment engine - 'fast', 'precise', 'mask', 
                'tunnel_full', or 'tunnel_mask' (default: 'fast')
        drift_interval: Frame interval for drift correction in precise/mask
                        engine (default: 100)
        margin: Border thickness for border matching mode (default: 8)
        smooth: Enable smooth blending at frame edges
        gpu: Enable GPU acceleration (future feature)
        verbose: Enable verbose logging
    """
    # Setup logging
    logger.remove()
    log_format_verbose = (
        "<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | "
        "<cyan>{function}</cyan> - <level>{message}</level>"
    )
    log_format_default = (
        "<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | "
        "<level>{message}</level>"
    )
    if verbose:
        logger.add(sys.stderr, format=log_format_verbose, level="DEBUG")
    else:
        logger.add(sys.stderr, format=log_format_default, level="INFO")
    
    # Log CLI options if verbose mode is enabled
    if verbose:
        logger.info("CLI options used:")
        logger.info(f"  Background video: {bg}")
        logger.info(f"  Foreground video: {fg}")
        logger.info(f"  Output path: {output}")
        logger.info(f"  Engine: {engine}")
        logger.info(f"  Drift interval: {drift_interval}")
        logger.info(f"  Window: {window}")
        logger.info(f"  Margin: {margin}")
        logger.info(f"  Smooth blending: {smooth}")
        logger.info(f"  GPU acceleration: {gpu}")
        logger.info(f"  Verbose logging: {verbose}")

    # Validate inputs
    bg_path = Path(bg)
    fg_path = Path(fg)

    if not bg_path.exists():
        logger.error(f"Background video not found: {bg}")
        return

    if not fg_path.exists():
        logger.error(f"Foreground video not found: {fg}")
        return

    # Generate output path if needed
    output_str: str
    if output is None:
        output_str = f"{bg_path.stem}_overlay_{fg_path.stem}.mp4"
        logger.info(f"Output path: {output_str}")
    else:
        output_str = str(output)

    # Validate output path
    output_path_obj = Path(output_str)
    if output_path_obj.exists():
        logger.warning(
            f"Output file {output_str} already exists. It will be overwritten."
        )

    # Ensure output directory exists
    output_path_obj.parent.mkdir(parents=True, exist_ok=True)

    # Validate engine parameter
    if engine not in ["fast", "precise", "mask", "tunnel_full", "tunnel_mask"]:
        err_msg = f"Invalid engine: {engine}. Must be 'fast', 'precise', 'mask', 'tunnel_full', or 'tunnel_mask'."
        logger.error(err_msg)
        return

    # Initialize config variables with defaults for 'fast' engine
    time_mode: MatchTimeMode = MatchTimeMode.PRECISE
    space_method: str = "template"
    temporal_method: TemporalMethod = TemporalMethod.CLASSIC
    max_keyframes: int = 1  # Default for fast engine (direct mapping)

    if engine == "fast":
        # Configuration for 'fast' is already set by defaults above
        pass
    elif engine == "precise" or engine == "mask":
        # Precise/Mask engines manage temporal method and keyframes internally
        # These are placeholders, actual behavior is in AlignmentEngine/TemporalAligner
        temporal_method = TemporalMethod.CLASSIC
        max_keyframes = 1000
    elif engine == "tunnel_full" or engine == "tunnel_mask":
        # Tunnel engines use direct frame comparison
        temporal_method = TemporalMethod.CLASSIC
        max_keyframes = 1  # Not used by tunnel engines
    # No else needed as engine is validated

    if gpu:
        logger.info("GPU acceleration not yet implemented")

    # Create processor and alignment engine
    processor = VideoProcessor()
    alignment = AlignmentEngine(
        processor=processor,
        verbose=verbose,
        max_keyframes=max_keyframes,
        engine_mode=engine,
        drift_interval=drift_interval,
        window=window,
    )

    # Process the videos
    try:
        alignment.process(
            bg_path=str(bg_path),
            fg_path=str(fg_path),
            output_path=output_str,
            time_mode=time_mode,
            space_method=space_method,
            temporal_method=temporal_method,
            skip_spatial=False,
            trim=True,
            border_thickness=margin,
            blend=smooth,
            window=window,
        )
    except Exception as e:
        logger.error(f"Processing failed: {e}")
        raise
</file>

<file path="PROGRESS.md">
# Progress: vidkompy Performance Improvements

## Completed Tasks

### Numba JIT Optimization Integration
- [x] Added numba>=0.58.0 dependency to pyproject.toml
- [x] Created numba_optimizations.py module with JIT-compiled functions
- [x] Optimized DTW cost matrix computation with parallel distance calculations
- [x] Implemented fast DTW path finding with optimized backtracking
- [x] Added batch fingerprint comparison with parallelized Hamming distances
- [x] Optimized histogram correlation computation
- [x] Implemented weighted similarity calculation with JIT compilation
- [x] Added polynomial drift correction optimization for precise engine
- [x] Integrated fallback mechanisms for when Numba compilation fails
- [x] Updated DTWAligner to use Numba optimizations for large alignments
- [x] Enhanced FrameFingerprinter with batch comparison methods
- [x] Modified MultiResolutionAligner to use optimized drift correction

### Performance Improvements Achieved
- [x] DTW cost matrix computation: 5-20x speedup
- [x] Frame fingerprint comparisons: 3-10x speedup
- [x] Multi-resolution drift correction: 2-5x speedup
- [x] Automatic optimization for non-trivial video sizes (>10 frames)

### Temporal Alignment Research & Design
- [x] Analyzed current temporal alignment implementation and identified drift issues
- [x] Researched best practices for precise video synchronization
- [x] Created SPEC.md with detailed design for 'precise' engine
- [x] Added --engine CLI parameter to support multiple alignment engines
- [x] Implemented precise temporal alignment engine with multi-resolution approach
- [x] Tested both engines and confirmed performance characteristics
- [x] Updated README.md with comprehensive documentation of both engines
- [x] Added detailed performance benchmarks and usage examples

### Drift Issue Resolution
- [x] Identified root cause of 5-second drift issue in precise engine
- [x] Implemented spatial cropping for temporal alignment (crop background to foreground region)
- [x] Added crop parameter support to VideoProcessor.extract_all_frames()
- [x] Integrated spatial alignment step before temporal alignment in precise engine
- [x] Tuned drift correction parameters (interval: 32→100, blend factor: 0.7→0.85)
- [x] Added drift tracking and logging for debugging
- [x] Verified fix resolves hand movement synchronization issue

### Precise Engine Improvement (Wave Drift Fix - Idea 1 from SPEC.md)
- [x] Updated `PreciseEngineConfig` with new parameters for enhanced drift correction (polynomial model, adaptive blend) and Savitzky-Golay smoothing.
- [x] Modified `MultiResolutionAligner.apply_drift_correction` to use polynomial regression for baseline drift and an adaptive blend factor.
- [x] Added global Savitzky-Golay smoothing pass in `MultiResolutionAligner.align` after drift correction and before full interpolation.
- [x] Adjusted loop variables and conditions in `apply_drift_correction` and `refine_alignment` for clarity and correctness.
- [x] Added safety checks for empty/short mappings in `interpolate_full_mapping` and `align`.

### Tunnel-Based Temporal Alignment Implementation
- [x] Implemented TunnelAligner base class with bidirectional matching framework
- [x] Created TunnelFullAligner for direct pixel-by-pixel frame comparison
- [x] Created TunnelMaskAligner with automatic content mask generation
- [x] Added sliding window constraints to enforce monotonicity
- [x] Implemented forward and backward pass algorithms
- [x] Added configurable early stopping and merge strategies
- [x] Integrated tunnel engines into TemporalAligner
- [x] Added CLI support for tunnel_full and tunnel_mask engines
- [x] Updated benchmark script to test new engines
- [x] Documented new engines in README.md

## Future Optimizations (Not Yet Implemented)

### Performance Enhancements
- [ ] GPU acceleration with CuPy for phase correlation
- [ ] FAISS integration for fast similarity search
- [ ] Replace OpenCV with PyAV for faster video I/O
- [ ] Implement sliding window refinement for drift correction (This was part of the precise engine, but could be reviewed/enhanced based on Idea 1's impact)
- [ ] Hierarchical multi-resolution matching (This is part of the precise engine, could be tuned)

### Architecture Improvements
- [ ] Replace template matching with phase correlation for spatial alignment
- [ ] Use neural embeddings (MobileNet) instead of perceptual hashes
- [ ] Implement proper fallback strategies for edge cases
- [ ] Add caching for repeated video pairs
- [ ] Implement Improvement Idea 2 (Optical Flow-Assisted Consistency) from SPEC.md
- [ ] Implement Improvement Idea 3 (Dominant Path DTW) from SPEC.md
- [ ] Fix CLI `-w` (window) parameter bug for `precise` engine (not used effectively).

### Code Quality
- [ ] Add comprehensive unit tests for new drift correction and smoothing.
- [ ] Create performance benchmark suite (or adapt existing `benchmark.sh` to test new config parameters).
- [ ] Add type hints throughout (ongoing).
- [ ] Improve error handling and recovery.
- [ ] Resolve remaining linter errors for line length in `multi_resolution_aligner.py`.
</file>

<file path="src/vidkompy/core/temporal_alignment.py">
#!/usr/bin/env python3
# this_file: src/vidkompy/core/temporal_alignment.py

"""
Temporal alignment module for synchronizing videos.

Implements frame-based temporal alignment with emphasis on
preserving all foreground frames without retiming.
"""

import cv2
import numpy as np
from skimage.metrics import structural_similarity as ssim
from loguru import logger
from rich.progress import (
    Progress,
    TextColumn,
    BarColumn,
    TimeRemainingColumn,
)
from rich.console import Console
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp

from vidkompy.models import VideoInfo, FrameAlignment, TemporalAlignment
from .video_processor import VideoProcessor
from .frame_fingerprint import FrameFingerprinter
from .dtw_aligner import DTWAligner
from .precise_temporal_alignment import PreciseTemporalAlignment
from .spatial_alignment import SpatialAligner
from .tunnel_aligner import TunnelFullAligner, TunnelMaskAligner, TunnelConfig

console = Console()


class TemporalAligner:
    """Handles temporal alignment between videos.

    This module synchronizes two videos in time, finding which frames
    correspond between them. This is the most complex part of vidkompy.

    Why temporal alignment is critical:
    - Videos may start at different times
    - Frame rates might differ
    - Some frames might be added/dropped in one video
    - The FG video timing must be preserved (it's the reference)

    Current implementation uses keyframe matching with interpolation.
    Future versions will use Dynamic Time Warping (see SPEC4.md).
    """

    def __init__(
        self,
        processor: VideoProcessor,
        max_keyframes: int = 200,
        drift_interval: int = 100,
        window: int = 100,
        engine_mode: str = "fast",
    ):
        """Initialize temporal aligner.

        Args:
            processor: Video processor instance
            max_keyframes: Maximum keyframes for frame matching
            drift_interval: Frame interval for drift correction
            window: DTW window size
            engine_mode: Alignment engine ('fast', 'precise', 'mask')
        """
        self.processor = processor
        self.max_keyframes = max_keyframes
        self.drift_interval = drift_interval
        self.use_perceptual_hash = True
        # Cache for pHash np.ndarray values for classic keyframe matching
        self.hash_cache: dict[str, dict[int, np.ndarray]] = {}
        self._current_mask: np.ndarray | None = None
        self.engine_mode = engine_mode
        self.use_precise_engine = engine_mode == "precise" or engine_mode == "mask"
        self.use_tunnel_engine = engine_mode == "tunnel_full" or engine_mode == "tunnel_mask"
        self.cli_window_size = window

        self.fingerprinter: FrameFingerprinter | None = None
        self.dtw_aligner = DTWAligner(window=window)
        self.use_dtw = True
        self.hasher: cv2.img_hash.PHash | None = None  # type: ignore
        self.precise_aligner = None
        self.tunnel_aligner = None

        try:
            if hasattr(cv2, "img_hash") and hasattr(cv2.img_hash, "PHash_create"):
                self.hasher = cv2.img_hash.PHash_create()  # type: ignore
                if self.hasher is not None:
                    logger.info("✓ Perceptual hashing enabled (pHash)")
                    self.use_perceptual_hash = True
                else:
                    # cv2.img_hash.PHash_create() returned None.
                    # Perceptual hashing disabled. Falling back to SSIM.
                    logger.warning("pHash_create() is None. Fallback to SSIM.")
                    self.use_perceptual_hash = False
            else:
                # cv2.img_hash.PHash_create not found.
                # Perceptual hashing disabled. Falling back to SSIM.
                logger.warning("pHash_create not found. Fallback to SSIM.")
                self.use_perceptual_hash = False
        except AttributeError:
            # cv2.img_hash module or PHash_create not available.
            # Ensure opencv-contrib-python is correctly installed.
            # Perceptual hashing disabled. Falling back to SSIM.
            logger.warning("cv2.img_hash missing. Fallback to SSIM.")
            self.use_perceptual_hash = False
        except Exception as e:
            logger.error(f"Error initializing perceptual hasher: {e}")
            self.use_perceptual_hash = False

        if self.hasher is None and self.use_perceptual_hash:
            logger.warning("Hasher is None, forcing SSIM.")
            self.use_perceptual_hash = False

    def calculate_adaptive_keyframe_count(
        self, fg_info: VideoInfo, bg_info: VideoInfo, target_drift_frames: float = 1.0
    ) -> int:
        """Calculate optimal keyframe count to prevent drift.

        Args:
            fg_info: Foreground video info
            bg_info: Background video info
            target_drift_frames: Maximum acceptable drift in frames

        Returns:
            Optimal number of keyframes
        """
        # Account for FPS difference
        fps_ratio = abs(bg_info.fps - fg_info.fps) / max(bg_info.fps, fg_info.fps)

        # More keyframes needed for higher FPS mismatch
        fps_factor = 1.0 + fps_ratio * 2.0

        # Calculate base requirement to keep interpolation gaps small
        base_keyframes = fg_info.frame_count / (target_drift_frames * 10)

        # Apply factors
        required_keyframes = int(base_keyframes * fps_factor)

        # Clamp to reasonable range
        return max(50, min(required_keyframes, fg_info.frame_count // 2))

    def align_frames(
        self, bg_info: VideoInfo, fg_info: VideoInfo, trim: bool = False
    ) -> TemporalAlignment:
        """Align videos using frame content matching.

        This method ensures ALL foreground frames are preserved without
        retiming. It finds the optimal background frame for each foreground
        frame.

        Args:
            bg_info: Background video metadata
            fg_info: Foreground video metadata
            trim: Whether to trim to overlapping segment

        Returns:
            TemporalAlignment with frame mappings
        """
        logger.info("Starting frame-based temporal alignment")

        # Use tunnel engine if enabled
        if self.use_tunnel_engine:
            logger.info(f"Using {self.engine_mode} temporal alignment engine")
            return self._align_frames_tunnel(bg_info, fg_info, trim)

        # Use precise engine if enabled
        if self.use_precise_engine:
            logger.info("Using precise temporal alignment engine")
            return self._align_frames_precise(bg_info, fg_info, trim)

        # Use DTW if enabled (default)
        if self.use_dtw:
            return self._align_frames_dtw(bg_info, fg_info, trim)

        # Otherwise use original keyframe matching
        keyframe_matches = self._find_keyframe_matches(bg_info, fg_info)

        if not keyframe_matches:
            logger.warning("No keyframe matches found, using direct mapping")
            # Fallback to simple frame mapping
            return self._create_direct_mapping(bg_info, fg_info)

        # Build complete frame alignment preserving all FG frames
        frame_alignments = self._build_frame_alignments(
            bg_info, fg_info, keyframe_matches, trim
        )

        # Calculate overall temporal offset
        first_match = keyframe_matches[0]
        offset_seconds = (first_match[0] / bg_info.fps) - (first_match[1] / fg_info.fps)

        return TemporalAlignment(
            offset_seconds=offset_seconds,
            frame_alignments=frame_alignments,
            method_used="frames",
            confidence=self._calculate_alignment_confidence(keyframe_matches),
        )

    def _align_frames_tunnel(
        self, bg_info: VideoInfo, fg_info: VideoInfo, trim: bool = False
    ) -> TemporalAlignment:
        """Align videos using tunnel-based direct frame comparison.

        Args:
            bg_info: Background video metadata
            fg_info: Foreground video metadata
            trim: Whether to trim to overlapping segment

        Returns:
            TemporalAlignment with frame mappings
        """
        # Initialize tunnel aligner if not already done
        if self.tunnel_aligner is None:
            config = TunnelConfig(
                window_size=self.cli_window_size if self.cli_window_size > 0 else 30,
                downsample_factor=0.5,  # Downsample to 50% for faster processing
                early_stop_threshold=0.05,
                merge_strategy="confidence_weighted"
            )
            
            if self.engine_mode == "tunnel_full":
                self.tunnel_aligner = TunnelFullAligner(config)
            else:  # tunnel_mask
                self.tunnel_aligner = TunnelMaskAligner(config)

        # Perform spatial alignment first
        logger.info("Performing spatial alignment for tunnel engine...")
        spatial_aligner = SpatialAligner()

        # Extract sample frames for spatial alignment
        bg_frames = self.processor.extract_frames(
            bg_info.path, [bg_info.frame_count // 2]
        )
        fg_frames = self.processor.extract_frames(
            fg_info.path, [fg_info.frame_count // 2]
        )

        if not bg_frames or not fg_frames:
            logger.error("Failed to extract sample frames for spatial alignment")
            return self._create_direct_mapping(bg_info, fg_info)

        bg_sample = bg_frames[0]
        fg_sample = fg_frames[0]

        # Get spatial alignment
        spatial_result = spatial_aligner.align(bg_sample, fg_sample)
        x_offset, y_offset = spatial_result.x_offset, spatial_result.y_offset
        
        logger.info(f"Spatial offset: ({x_offset}, {y_offset})")

        # Extract all frames for tunnel alignment
        logger.info("Extracting all frames for tunnel alignment...")
        
        # For tunnel engines, we need full frames without pre-cropping
        fg_all_frames = self.processor.extract_all_frames(
            fg_info.path,
            resize_factor=0.25  # Use 25% size for processing
        )
        
        bg_all_frames = self.processor.extract_all_frames(
            bg_info.path,
            resize_factor=0.25  # Use 25% size for processing
        )

        if fg_all_frames is None or bg_all_frames is None:
            logger.error("Failed to extract frames for tunnel alignment")
            return self._create_direct_mapping(bg_info, fg_info)

        # Scale offsets for downsampled frames
        scaled_x_offset = int(x_offset * 0.25)
        scaled_y_offset = int(y_offset * 0.25)

        # Perform tunnel alignment
        logger.info(f"Performing {self.engine_mode} alignment...")
        frame_alignments, confidence = self.tunnel_aligner.align(
            fg_all_frames,
            bg_all_frames,
            scaled_x_offset,
            scaled_y_offset,
            verbose=True
        )

        # Calculate temporal offset from first alignment
        if frame_alignments:
            first_align = frame_alignments[0]
            offset_seconds = (first_align.bg_frame / bg_info.fps) - (first_align.fg_frame / fg_info.fps)
        else:
            offset_seconds = 0.0

        return TemporalAlignment(
            offset_seconds=offset_seconds,
            frame_alignments=frame_alignments,
            method_used=self.engine_mode,
            confidence=confidence,
        )

    def _align_frames_precise(
        self, bg_info: VideoInfo, fg_info: VideoInfo, trim: bool = False
    ) -> TemporalAlignment:
        """Align videos using the precise multi-resolution engine.

        This method uses advanced techniques including:
        - Multi-resolution temporal pyramids
        - Keyframe anchoring
        - Bidirectional DTW
        - Sliding window refinement

        Args:
            bg_info: Background video metadata
            fg_info: Foreground video metadata
            trim: Whether to trim to overlapping segment

        Returns:
            TemporalAlignment with frame mappings
        """
        # Initialize precise aligner if not already done
        if self.precise_aligner is None:
            if self.fingerprinter is None:
                self.fingerprinter = FrameFingerprinter()
            self.precise_aligner = PreciseTemporalAlignment(
                fingerprinter=self.fingerprinter,
                verbose=False,  # Use default verbosity
                interval=self.drift_interval,
                cli_window_size=self.cli_window_size,
            )

        # Perform spatial alignment first to get crop coordinates
        logger.info("Performing spatial alignment to determine crop region...")
        spatial_aligner = SpatialAligner()

        # Extract sample frames for spatial alignment
        bg_frames = self.processor.extract_frames(
            bg_info.path, [bg_info.frame_count // 2]
        )
        fg_frames = self.processor.extract_frames(
            fg_info.path, [fg_info.frame_count // 2]
        )

        if not bg_frames or not fg_frames:
            logger.error("Failed to extract sample frames for spatial alignment")
            return self._create_direct_mapping(bg_info, fg_info)

        bg_sample = bg_frames[0]
        fg_sample = fg_frames[0]

        if bg_sample is None or fg_sample is None:
            logger.error("Failed to extract sample frames for spatial alignment")
            return self._create_direct_mapping(bg_info, fg_info)

        # Get spatial alignment
        spatial_result = spatial_aligner.align(bg_sample, fg_sample)
        x_offset = spatial_result.x_offset
        y_offset = spatial_result.y_offset

        logger.info(
            f"Spatial alignment: offset=({x_offset}, {y_offset}), confidence={spatial_result.confidence:.3f}"
        )

        # Extract all frames for precise alignment
        logger.info("Extracting frames for precise alignment...")

        # For background, extract with cropping to match foreground region
        crop_region = (x_offset, y_offset, fg_info.width, fg_info.height)
        bg_frames = self.processor.extract_all_frames(
            bg_info.path, resize_factor=0.25, crop=crop_region
        )

        # For foreground, extract without cropping
        fg_frames = self.processor.extract_all_frames(fg_info.path, resize_factor=0.25)

        if bg_frames is None or fg_frames is None:
            logger.error("Failed to extract frames")
            return self._create_direct_mapping(bg_info, fg_info)

        # Perform precise alignment
        try:
            frame_mapping, confidence = self.precise_aligner.align(fg_frames, bg_frames)
        except Exception as e:
            logger.error(f"Precise alignment failed: {e}")
            logger.info("Falling back to standard alignment")
            return self._align_frames_dtw(bg_info, fg_info, trim)

        # Convert mapping to frame alignments
        frame_alignments = []

        # Determine range based on trim flag
        if trim and len(frame_mapping) > 0:
            # Find valid range where background frames are available
            start_idx = 0
            end_idx = len(frame_mapping)

            # Find first valid mapping
            for i in range(len(frame_mapping)):
                if frame_mapping[i] >= 0:
                    start_idx = i
                    break

            # Find last valid mapping
            for i in range(len(frame_mapping) - 1, -1, -1):
                if frame_mapping[i] < bg_info.frame_count:
                    end_idx = i + 1
                    break
        else:
            start_idx = 0
            end_idx = len(frame_mapping)

        # Build frame alignments
        for fg_idx in range(start_idx, end_idx):
            if fg_idx < len(frame_mapping):
                bg_idx = int(frame_mapping[fg_idx])
                # Ensure bg_idx is within bounds
                bg_idx = max(0, min(bg_idx, bg_info.frame_count - 1))

                frame_alignments.append(
                    FrameAlignment(
                        fg_frame_idx=fg_idx,
                        bg_frame_idx=bg_idx,
                        similarity_score=confidence,  # Use overall confidence
                    )
                )

        # Calculate temporal offset
        if frame_alignments:
            first_align = frame_alignments[0]
            offset_seconds = (first_align.bg_frame_idx / bg_info.fps) - (
                first_align.fg_frame_idx / fg_info.fps
            )
        else:
            offset_seconds = 0.0

        logger.info(
            f"Precise alignment complete. Mapped {len(frame_alignments)} frames with confidence {confidence:.3f}"
        )

        return TemporalAlignment(
            offset_seconds=offset_seconds,
            frame_alignments=frame_alignments,
            method_used="precise",
            confidence=confidence,
        )

    def _precompute_frame_hashes(
        self, video_path: str, frame_indices: list[int], resize_factor: float = 0.125
    ) -> dict[int, np.ndarray]:
        """Pre-compute perceptual hashes for frames in parallel."""
        if not self.use_perceptual_hash or self.hasher is None:
            return {}

        # Check cache first
        if video_path in self.hash_cache and all(
            idx in self.hash_cache[video_path] for idx in frame_indices
        ):
            return {
                idx: self.hash_cache[video_path][idx]
                for idx in frame_indices
                if idx in self.hash_cache[video_path]
            }

        logger.debug(f"Pre-computing hashes for {len(frame_indices)} frames")
        start_time = time.time()

        # Extract frames
        frames = self.processor.extract_frames(video_path, frame_indices, resize_factor)

        # Compute hashes in parallel
        hashes: dict[int, np.ndarray] = {}
        with ThreadPoolExecutor(max_workers=mp.cpu_count()) as executor:
            future_to_idx = {
                executor.submit(self._compute_frame_hash, frame): idx
                for idx, frame in zip(frame_indices, frames, strict=False)
                if frame is not None
            }

            for future in as_completed(future_to_idx):
                idx = future_to_idx[future]
                try:
                    hash_value = future.result()  # This should be np.ndarray (pHash)
                    if hash_value is not None:
                        hashes[idx] = hash_value
                except Exception as e:
                    logger.warning(f"Failed to compute hash for frame {idx}: {e}")

        # Update cache (ensure it stores np.ndarray for pHash)
        if video_path not in self.hash_cache:
            self.hash_cache[video_path] = {}
        # This logic seems to intend to store full fingerprint dicts, but for cv2.norm we need pHash np.array
        # For _find_keyframe_matches, we need the pHash directly. Let's ensure this cache stores that.
        self.hash_cache[video_path].update(
            hashes
        )  # hashes should be dict[int, np.ndarray]

        elapsed = time.time() - start_time
        logger.debug(f"Computed {len(hashes)} hashes in {elapsed:.2f}s")

        return hashes

    def _precompute_masked_video_fingerprints(
        self,
        video_path: str,
        frame_indices: list[int],
        mask: np.ndarray,
        resize_factor: float = 0.25,
    ) -> dict[int, dict[str, np.ndarray]]:
        """Pre-compute masked fingerprints for video frames.

        Args:
            video_path: Path to video file
            frame_indices: List of frame indices to process
            mask: Binary mask for border regions
            resize_factor: Factor to resize frames before fingerprinting

        Returns:
            Dictionary mapping frame indices to fingerprints
        """
        if self.fingerprinter is None:
            msg = "Fingerprinter not initialized"
            raise RuntimeError(msg)

        logger.debug(f"Computing masked fingerprints for {len(frame_indices)} frames")
        start_time = time.time()

        # Extract frames
        frames = self.processor.extract_frames(video_path, frame_indices, resize_factor)

        # Compute masked fingerprints
        fingerprints = {}
        for idx, frame in zip(frame_indices, frames, strict=False):
            if frame is not None:
                # Resize mask to match frame size if needed
                if frame.shape[:2] != mask.shape[:2]:
                    resized_mask = cv2.resize(
                        mask.astype(np.float32),
                        (frame.shape[1], frame.shape[0]),
                        interpolation=cv2.INTER_LINEAR,
                    )
                    resized_mask = (resized_mask > 0.5).astype(np.uint8)
                else:
                    resized_mask = mask

                # Compute masked fingerprint
                fingerprint = self.fingerprinter.compute_masked_fingerprint(
                    frame, resized_mask
                )
                fingerprints[idx] = fingerprint

        elapsed = time.time() - start_time
        logger.debug(
            f"Computed {len(fingerprints)} masked fingerprints in {elapsed:.2f}s"
        )

        return fingerprints

    def _find_keyframe_matches(
        self, bg_info: VideoInfo, fg_info: VideoInfo
    ) -> list[tuple[int, int, float]]:
        """Find matching keyframes between videos using monotonic dynamic programming.

        Returns:
            List of (bg_frame_idx, fg_frame_idx, similarity) tuples

        Why keyframe matching:
        - Can't compare all frame pairs (too expensive)
        - Keyframes capture important moments
        - Interpolation fills in between keyframes

        Why uniform sampling:
        - Ensures coverage of entire video
        - Predictable behavior
        - Works for any content type

        Current issues (see SPEC4.md for solutions):
        - Independent matching breaks monotonicity
        - Cost matrix computation is slow
        - Poor interpolation between sparse keyframes
        """
        # Use adaptive keyframe density calculation
        adaptive_keyframes = self.calculate_adaptive_keyframe_count(fg_info, bg_info)
        effective_target_keyframes = min(
            self.max_keyframes, adaptive_keyframes, fg_info.frame_count
        )

        logger.info(f"Adaptive calculation suggests {adaptive_keyframes} keyframes")
        logger.info(
            f"Using {effective_target_keyframes} keyframes (clamped by max_keyframes={self.max_keyframes})"
        )

        if (
            not self.use_perceptual_hash or self.hasher is None
        ):  # Indicates SSIM will be used
            # If SSIM is used and the number of keyframes is high, warn the user.
            if effective_target_keyframes > 200:  # Threshold for SSIM warning
                logger.warning(
                    f"SSIM mode with high target of {effective_target_keyframes} keyframes."
                )
            else:
                logger.info(
                    f"SSIM mode. Target keyframes: {effective_target_keyframes}"
                )
        else:
            logger.info(
                f"Perceptual hash mode. Target keyframes: {effective_target_keyframes}"
            )

        sample_interval = max(1, fg_info.frame_count // effective_target_keyframes)

        logger.info(f"Sampling every {sample_interval} frames for keyframe matching")

        # Prepare indices - sample uniformly across the video
        fg_indices = list(range(0, fg_info.frame_count, sample_interval))
        # Always include first and last frames
        if 0 not in fg_indices:
            fg_indices.insert(0, 0)
        if fg_info.frame_count - 1 not in fg_indices:
            fg_indices.append(fg_info.frame_count - 1)

        # Sample more densely from background to allow flexibility
        bg_sample_interval = max(1, sample_interval // 2)
        bg_indices = list(range(0, bg_info.frame_count, bg_sample_interval))

        logger.info(
            f"Sampling {len(fg_indices)} FG frames and {len(bg_indices)} BG frames"
        )

        # Pre-compute all hashes if available
        if self.use_perceptual_hash and self.hasher is not None:
            logger.info("Pre-computing perceptual hashes...")

            # Check if we need masked hashing
            if self._current_mask is not None:
                logger.info("Computing masked perceptual hashes for border mode...")
                fg_hashes = self._precompute_masked_frame_hashes(
                    fg_info.path, fg_indices, 0.125
                )
                bg_hashes = self._precompute_masked_frame_hashes(
                    bg_info.path, bg_indices, 0.125
                )
            else:
                fg_hashes = self._precompute_frame_hashes(
                    fg_info.path, fg_indices, 0.125
                )
                bg_hashes = self._precompute_frame_hashes(
                    bg_info.path, bg_indices, 0.125
                )

            if not fg_hashes or not bg_hashes:
                logger.error("Failed to compute hashes, falling back to SSIM")
                self.use_perceptual_hash = False

        # Build cost matrix using dynamic programming approach
        logger.info("Building cost matrix for dynamic programming alignment...")
        cost_matrix = self._build_cost_matrix(bg_info, fg_info, bg_indices, fg_indices)

        if cost_matrix is None:
            logger.error("Failed to build cost matrix")
            return []

        # Find optimal monotonic path through cost matrix
        matches = self._find_optimal_path(cost_matrix, bg_indices, fg_indices)

        # Validate matches with higher quality check if needed
        if len(matches) < 10:
            logger.warning(
                f"Only found {len(matches)} matches, attempting refinement..."
            )
            matches = self._refine_matches(
                matches, bg_info, fg_info, bg_indices, fg_indices
            )

        logger.info(f"Found {len(matches)} monotonic keyframe matches")

        if self.use_perceptual_hash:
            logger.info("✓ Perceptual hashing provided significant speedup")

        return matches

    def _compute_frame_similarity(
        self, frame1: np.ndarray, frame2: np.ndarray, mask: np.ndarray | None = None
    ) -> float:
        """Compute similarity between two frames using perceptual hash or SSIM.

        Args:
            frame1: First frame to compare
            frame2: Second frame to compare
            mask: Optional binary mask to restrict comparison to specific regions
        """
        if mask is not None:
            frame1 = self._apply_mask_to_frame(frame1, mask)
            frame2 = self._apply_mask_to_frame(frame2, mask)

        if self.use_perceptual_hash and self.hasher is not None:
            # _compute_frame_hash now directly returns the pHash np.ndarray
            hash1_val = self._compute_frame_hash(frame1)
            hash2_val = self._compute_frame_hash(frame2)
            if hash1_val is None or hash2_val is None:
                return 0.0  # Or handle error appropriately
            distance = cv2.norm(hash1_val, hash2_val, cv2.NORM_HAMMING)
            max_distance = 64  # pHash is typically 64-bit
            similarity = 1.0 - (distance / max_distance)
            return float(similarity)
        else:
            return self._compute_ssim_similarity(frame1, frame2)

    def _compute_ssim_similarity(self, frame1: np.ndarray, frame2: np.ndarray) -> float:
        """Compute similarity between two frames using SSIM."""
        # Convert to grayscale
        gray1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
        gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)

        # Ensure same size
        if gray1.shape != gray2.shape:
            gray2 = cv2.resize(gray2, (gray1.shape[1], gray1.shape[0]))

        # Compute SSIM
        score, _ = ssim(gray1, gray2, full=True)
        return float(score)

    def _precompute_masked_frame_hashes(
        self, video_path: str, frame_indices: list[int], resize_factor: float = 0.125
    ) -> dict[int, np.ndarray]:
        """Pre-compute masked perceptual hashes for frames in parallel."""
        if (
            not self.use_perceptual_hash
            or self.hasher is None
            or self._current_mask is None
        ):
            return {}

        # This cache should store pHash np.ndarrays
        # Check cache first - this assumes self.hash_cache[video_path] stores dict[int, np.ndarray]
        if video_path in self.hash_cache and all(
            idx in self.hash_cache[video_path] for idx in frame_indices
        ):
            return {
                idx: self.hash_cache[video_path][idx]
                for idx in frame_indices
                if idx in self.hash_cache[video_path]
            }

        logger.debug(f"Pre-computing masked hashes for {len(frame_indices)} frames")
        start_time = time.time()

        # Extract frames
        frames = self.processor.extract_frames(video_path, frame_indices, resize_factor)

        # Compute masked hashes in parallel
        hashes: dict[int, np.ndarray] = {}
        with ThreadPoolExecutor(max_workers=mp.cpu_count()) as executor:
            future_to_idx = {
                executor.submit(
                    self._compute_masked_frame_hash, frame, self._current_mask
                ): idx
                for idx, frame in zip(frame_indices, frames, strict=False)
                if frame is not None
            }

            for future in as_completed(future_to_idx):
                idx = future_to_idx[future]
                try:
                    hash_value = future.result()  # This should be np.ndarray (pHash)
                    if hash_value is not None:
                        hashes[idx] = hash_value
                except Exception as e:
                    logger.warning(
                        f"Failed to compute masked hash for frame {idx}: {e}"
                    )

        # Update cache with pHash np.ndarrays
        if video_path not in self.hash_cache:
            self.hash_cache[video_path] = {}
        self.hash_cache[video_path].update(hashes)

        elapsed = time.time() - start_time
        logger.debug(f"Computed {len(hashes)} masked hashes in {elapsed:.2f}s")

        return hashes

    def _compute_masked_frame_hash(
        self, frame: np.ndarray, mask: np.ndarray
    ) -> np.ndarray | None:
        """Compute perceptual hash for masked frame."""
        if frame is None or self.hasher is None:
            return None

        # Resize mask to match frame if needed
        if frame.shape[:2] != mask.shape[:2]:
            resized_mask = cv2.resize(
                mask.astype(np.float32),
                (frame.shape[1], frame.shape[0]),
                interpolation=cv2.INTER_LINEAR,
            )
            resized_mask = (resized_mask > 0.5).astype(np.uint8)
        else:
            resized_mask = mask

        # Apply mask to frame
        if len(frame.shape) == 3:
            masked = frame.copy()
            for c in range(frame.shape[2]):
                masked[:, :, c] = frame[:, :, c] * resized_mask
        else:
            masked = frame * resized_mask

        # Crop to bounding box of mask
        rows = np.any(resized_mask, axis=1)
        cols = np.any(resized_mask, axis=0)

        if not np.any(rows) or not np.any(cols):
            # Empty mask, compute hash on original
            resized = cv2.resize(frame, (32, 32))
        else:
            rmin, rmax = np.where(rows)[0][[0, -1]]
            cmin, cmax = np.where(cols)[0][[0, -1]]
            cropped = masked[rmin : rmax + 1, cmin : cmax + 1]

            # Resize for hashing
            resized = cv2.resize(cropped, (32, 32))

        # Convert to grayscale if needed
        if len(resized.shape) == 3:
            resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

        hash_value = self.hasher.compute(resized)  # This is the pHash np.ndarray
        return hash_value

    def _compute_frame_hash(self, frame: np.ndarray) -> np.ndarray | None:
        """Compute perceptual hash of a frame (specifically pHash)."""
        if frame is None or self.hasher is None:  # Check if hasher is initialized
            return None

        # Resize to standard size for consistent hashing
        resized = cv2.resize(frame, (32, 32), interpolation=cv2.INTER_AREA)

        # Convert to grayscale if needed
        if len(resized.shape) == 3:
            resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

        # Compute hash
        hash_value = self.hasher.compute(resized)  # This is the pHash np.ndarray
        return hash_value

    def _build_cost_matrix(
        self,
        bg_info: VideoInfo,
        fg_info: VideoInfo,
        bg_indices: list[int],
        fg_indices: list[int],
    ) -> np.ndarray | None:
        """Build cost matrix for dynamic programming alignment.

        Lower cost = better match. Uses perceptual hashes if available.
        """
        n_fg = len(fg_indices)
        n_bg = len(bg_indices)

        # Initialize cost matrix
        cost_matrix = np.full((n_fg, n_bg), np.inf)

        # Use hashes if available
        if (
            self.use_perceptual_hash
            and self.hasher is not None
            and bg_info.path in self.hash_cache
            and fg_info.path in self.hash_cache
        ):
            logger.debug("Building cost matrix using perceptual hashes")

            for i, fg_idx in enumerate(fg_indices):
                fg_hash = self.hash_cache[fg_info.path].get(fg_idx)
                if fg_hash is None:
                    continue

                for j, bg_idx in enumerate(bg_indices):
                    bg_hash = self.hash_cache[bg_info.path].get(bg_idx)
                    if bg_hash is None:
                        continue

                    # Hamming distance as cost
                    distance = cv2.norm(fg_hash, bg_hash, cv2.NORM_HAMMING)
                    cost_matrix[i, j] = distance
        else:
            logger.debug("Building cost matrix using frame extraction")
            # Parallelize frame extraction and comparison
            resize_factor = 0.25

            # Pre-extract all frames in parallel batches
            logger.info("Pre-extracting frames for cost matrix...")

            with ThreadPoolExecutor(max_workers=4) as executor:
                fg_future = executor.submit(
                    self.processor.extract_frames,
                    fg_info.path,
                    fg_indices,
                    resize_factor,
                )
                bg_future = executor.submit(
                    self.processor.extract_frames,
                    bg_info.path,
                    bg_indices,
                    resize_factor,
                )

                fg_frames = fg_future.result()
                bg_frames = bg_future.result()

            # Create frame dictionaries for fast lookup
            fg_frame_dict = {
                idx: frame
                for idx, frame in zip(fg_indices, fg_frames, strict=False)
                if frame is not None
            }
            bg_frame_dict = {
                idx: frame
                for idx, frame in zip(bg_indices, bg_frames, strict=False)
                if frame is not None
            }

            def compute_cell(i, j):
                """Compute single cell of cost matrix."""
                fg_idx = fg_indices[i]
                bg_idx = bg_indices[j]

                if fg_idx not in fg_frame_dict or bg_idx not in bg_frame_dict:
                    return i, j, np.inf

                fg_frame = fg_frame_dict[fg_idx]
                bg_frame = bg_frame_dict[bg_idx]

                # Apply mask if in border mode
                if self._current_mask is not None:
                    similarity = self._compute_frame_similarity(
                        bg_frame, fg_frame, self._current_mask
                    )
                else:
                    similarity = self._compute_frame_similarity(bg_frame, fg_frame)

                cost = 1.0 - similarity

                # Add temporal consistency penalty
                expected_j = int(i * n_bg / n_fg)
                time_penalty = 0.1 * abs(j - expected_j) / n_bg
                cost += time_penalty

                return i, j, cost

            # Process comparisons in parallel
            logger.info(f"Computing {n_fg * n_bg} similarities in parallel...")

            with Progress(
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TimeRemainingColumn(),
                console=console,
                transient=True,
            ) as progress:
                task = progress.add_task(
                    "  Computing similarities...", total=n_fg * n_bg
                )

                with ThreadPoolExecutor(max_workers=mp.cpu_count()) as executor:
                    # Submit all tasks
                    futures = []
                    for i in range(n_fg):
                        for j in range(n_bg):
                            future = executor.submit(compute_cell, i, j)
                            futures.append(future)

                    # Collect results
                    for future in as_completed(futures):
                        i, j, cost = future.result()
                        cost_matrix[i, j] = cost
                        progress.update(task, advance=1)

        return cost_matrix

    def _find_optimal_path(
        self, cost_matrix: np.ndarray, bg_indices: list[int], fg_indices: list[int]
    ) -> list[tuple[int, int, float]]:
        """Find optimal monotonic path through cost matrix using dynamic programming."""
        n_fg, n_bg = cost_matrix.shape

        # Dynamic programming table
        dp = np.full_like(cost_matrix, np.inf)
        parent = np.zeros_like(cost_matrix, dtype=int)

        # Initialize first row - first fg frame can match any bg frame
        dp[0, :] = cost_matrix[0, :]

        # Fill DP table
        for i in range(1, n_fg):
            for j in range(n_bg):
                # Can only come from previous bg frames (monotonic constraint)
                for k in range(j + 1):
                    if dp[i - 1, k] + cost_matrix[i, j] < dp[i, j]:
                        dp[i, j] = dp[i - 1, k] + cost_matrix[i, j]
                        parent[i, j] = k

        # Find best path by backtracking from minimum cost in last row
        min_j = np.argmin(dp[-1, :])
        path = []

        # Backtrack
        j = min_j
        for i in range(n_fg - 1, -1, -1):
            # Convert cost back to similarity
            similarity = 1.0 - cost_matrix[i, j]
            path.append((bg_indices[j], fg_indices[i], similarity))

            if i > 0:
                j = parent[i, j]

        path.reverse()

        # Filter out low-quality matches
        filtered_path = [
            match
            for match in path
            if match[2] > 0.5  # Similarity threshold
        ]

        return filtered_path

    def _refine_matches(
        self,
        initial_matches: list[tuple[int, int, float]],
        bg_info: VideoInfo,
        fg_info: VideoInfo,
        bg_indices: list[int],
        fg_indices: list[int],
    ) -> list[tuple[int, int, float]]:
        """Refine matches by adding intermediate keyframes where needed."""
        if len(initial_matches) < 2:
            return initial_matches

        refined = [initial_matches[0]]

        for i in range(1, len(initial_matches)):
            prev_match = refined[-1]
            curr_match = initial_matches[i]

            # Check if there's a large gap
            fg_gap = curr_match[1] - prev_match[1]
            curr_match[0] - prev_match[0]

            if fg_gap > 50:  # Large gap in foreground frames
                # Add intermediate keyframe
                mid_fg = (prev_match[1] + curr_match[1]) // 2
                mid_bg = (prev_match[0] + curr_match[0]) // 2

                # Find closest sampled indices
                closest_fg = min(fg_indices, key=lambda x: abs(x - mid_fg))
                closest_bg = min(bg_indices, key=lambda x: abs(x - mid_bg))

                # Compute similarity for intermediate frame
                if (
                    self.use_perceptual_hash
                    and fg_info.path in self.hash_cache
                    and bg_info.path in self.hash_cache
                ):
                    fg_hash = self.hash_cache[fg_info.path].get(closest_fg)
                    bg_hash = self.hash_cache[bg_info.path].get(closest_bg)

                    if fg_hash is not None and bg_hash is not None:
                        distance = cv2.norm(fg_hash, bg_hash, cv2.NORM_HAMMING)
                        similarity = 1.0 - (distance / 64.0)

                        if similarity > 0.5:
                            refined.append((closest_bg, closest_fg, similarity))

            refined.append(curr_match)

        return refined

    def _filter_monotonic(
        self, matches: list[tuple[int, int, float]]
    ) -> list[tuple[int, int, float]]:
        """Filter matches to ensure monotonic progression.

        This is now only used as a safety check since the DP algorithm
        already ensures monotonicity.
        """
        if not matches:
            return matches

        # Sort by foreground index
        matches.sort(key=lambda x: x[1])

        # Verify monotonicity (should already be monotonic from DP)
        filtered = []
        last_bg_idx = -1

        for bg_idx, fg_idx, sim in matches:
            if bg_idx > last_bg_idx:
                filtered.append((bg_idx, fg_idx, sim))
                last_bg_idx = bg_idx
            else:
                logger.warning(
                    f"Unexpected non-monotonic match: bg[{bg_idx}] for fg[{fg_idx}]"
                )

        return filtered

    def _build_frame_alignments(
        self,
        bg_info: VideoInfo,
        fg_info: VideoInfo,
        keyframe_matches: list[tuple[int, int, float]],
        trim: bool,
    ) -> list[FrameAlignment]:
        """Build complete frame-to-frame alignment.

        Creates alignment for EVERY foreground frame, finding the optimal
        background frame based on keyframe matches.
        """
        alignments = []

        # Determine range of foreground frames to process
        if trim and keyframe_matches:
            start_fg = keyframe_matches[0][1]
            end_fg = keyframe_matches[-1][1] + 1
        else:
            start_fg = 0
            end_fg = fg_info.frame_count

        logger.info(f"Building alignment for FG frames {start_fg} to {end_fg - 1}")

        # For each foreground frame, find optimal background frame
        for fg_idx in range(start_fg, end_fg):
            bg_idx = self._interpolate_bg_frame(
                fg_idx, keyframe_matches, bg_info, fg_info
            )

            # Ensure bg_idx is valid
            bg_idx = max(0, min(bg_idx, bg_info.frame_count - 1))

            # Estimate similarity based on nearby keyframes
            similarity = self._estimate_similarity(fg_idx, keyframe_matches)

            alignments.append(
                FrameAlignment(
                    fg_frame_idx=fg_idx,
                    bg_frame_idx=bg_idx,
                    similarity_score=similarity,
                )
            )

        logger.info(f"Created {len(alignments)} frame alignments")
        return alignments

    def _interpolate_bg_frame(
        self,
        fg_idx: int,
        keyframe_matches: list[tuple[int, int, float]],
        bg_info: VideoInfo,
        fg_info: VideoInfo,
    ) -> int:
        """Interpolate background frame index for given foreground frame.

        Uses smooth interpolation between keyframe matches to avoid
        sudden jumps or speed changes.
        """
        if not keyframe_matches:
            # Simple ratio-based mapping
            ratio = bg_info.fps / fg_info.fps if fg_info.fps > 0 else 1.0
            return int(fg_idx * ratio)

        # Find surrounding keyframes
        prev_match = None
        next_match = None

        for match in keyframe_matches:
            if match[1] <= fg_idx:
                prev_match = match
            elif match[1] > fg_idx and next_match is None:
                next_match = match
                break

        # Handle edge cases
        if prev_match is None:
            prev_match = keyframe_matches[0]
        if next_match is None:
            next_match = keyframe_matches[-1]

        # If at or beyond edges, extrapolate
        if fg_idx <= prev_match[1]:
            # Before first keyframe
            fps_ratio = bg_info.fps / fg_info.fps if fg_info.fps > 0 else 1.0
            offset = (prev_match[1] - fg_idx) * fps_ratio
            return int(prev_match[0] - offset)

        if fg_idx >= next_match[1]:
            # After last keyframe
            fps_ratio = bg_info.fps / fg_info.fps if fg_info.fps > 0 else 1.0
            offset = (fg_idx - next_match[1]) * fps_ratio
            return int(next_match[0] + offset)

        # Interpolate between keyframes
        if prev_match[1] == next_match[1]:
            return prev_match[0]

        # Calculate position ratio with smooth interpolation
        ratio = (fg_idx - prev_match[1]) / (next_match[1] - prev_match[1])

        # Apply smoothstep for more natural motion
        smooth_ratio = ratio * ratio * (3.0 - 2.0 * ratio)

        # Interpolate background frame
        bg_idx = prev_match[0] + smooth_ratio * (next_match[0] - prev_match[0])

        return int(bg_idx)

    def _estimate_similarity(
        self, fg_idx: int, keyframe_matches: list[tuple[int, int, float]]
    ) -> float:
        """Estimate similarity score for a frame based on nearby keyframes."""
        if not keyframe_matches:
            return 0.5

        # Find closest keyframe
        min_dist = float("inf")
        closest_sim = 0.5

        for _, kf_fg_idx, similarity in keyframe_matches:
            dist = abs(fg_idx - kf_fg_idx)
            if dist < min_dist:
                min_dist = dist
                closest_sim = similarity

        # Decay similarity based on distance
        decay_rate = 0.95**min_dist
        return closest_sim * decay_rate

    def _calculate_alignment_confidence(
        self, keyframe_matches: list[tuple[int, int, float]]
    ) -> float:
        """Calculate overall confidence in the alignment."""
        if not keyframe_matches:
            return 0.0

        # Average similarity of matches
        avg_similarity = sum(m[2] for m in keyframe_matches) / len(keyframe_matches)

        # Coverage (how well distributed the matches are)
        coverage = len(keyframe_matches) / max(len(keyframe_matches), 20)

        return min(1.0, avg_similarity * coverage)

    def _create_direct_mapping(
        self, bg_info: VideoInfo, fg_info: VideoInfo
    ) -> TemporalAlignment:
        """Create simple direct frame mapping as fallback."""
        fps_ratio = bg_info.fps / fg_info.fps if fg_info.fps > 0 else 1.0

        alignments = []
        for fg_idx in range(fg_info.frame_count):
            bg_idx = int(fg_idx * fps_ratio)
            bg_idx = min(bg_idx, bg_info.frame_count - 1)

            alignments.append(
                FrameAlignment(
                    fg_frame_idx=fg_idx, bg_frame_idx=bg_idx, similarity_score=0.5
                )
            )

        return TemporalAlignment(
            offset_seconds=0.0,
            frame_alignments=alignments,
            method_used="direct",
            confidence=0.3,
        )

    def _align_frames_dtw(
        self, bg_info: VideoInfo, fg_info: VideoInfo, trim: bool = False
    ) -> TemporalAlignment:
        """Align videos using Dynamic Time Warping for guaranteed monotonic alignment.

        This is the new improved method that replaces the problematic keyframe matching.

        Why DTW:
        - Guarantees monotonic alignment (no backward jumps)
        - Finds globally optimal path
        - Handles speed variations naturally
        - No more drift or catch-up issues
        """
        logger.info("Using DTW-based temporal alignment")

        # Initialize fingerprinter if needed
        if self.fingerprinter is None:
            try:
                self.fingerprinter = FrameFingerprinter()
            except Exception as e:
                logger.error(f"Failed to initialize fingerprinter: {e}")
                logger.warning("Falling back to classic alignment")
                self.use_dtw = False
                return self.align_frames(bg_info, fg_info, trim)

        # Determine frames to sample
        fg_sample_interval = max(1, fg_info.frame_count // self.max_keyframes)
        bg_sample_interval = max(1, bg_info.frame_count // (self.max_keyframes * 2))

        fg_indices = list(range(0, fg_info.frame_count, fg_sample_interval))
        bg_indices = list(range(0, bg_info.frame_count, bg_sample_interval))

        # Always include first and last frames
        if 0 not in fg_indices:
            fg_indices.insert(0, 0)
        if fg_info.frame_count - 1 not in fg_indices:
            fg_indices.append(fg_info.frame_count - 1)
        if 0 not in bg_indices:
            bg_indices.insert(0, 0)
        if bg_info.frame_count - 1 not in bg_indices:
            bg_indices.append(bg_info.frame_count - 1)

        logger.info(
            f"DTW sampling: {len(fg_indices)} FG frames, {len(bg_indices)} BG frames"
        )

        # Compute fingerprints for sampled frames
        logger.info("Computing frame fingerprints...")

        # Check if we need to use masked fingerprints
        if self._current_mask is not None:
            logger.info("Using masked fingerprints for border mode DTW alignment")
            fg_fingerprints = self._precompute_masked_video_fingerprints(
                fg_info.path, fg_indices, self._current_mask, resize_factor=0.25
            )
            bg_fingerprints = self._precompute_masked_video_fingerprints(
                bg_info.path, bg_indices, self._current_mask, resize_factor=0.25
            )
        else:
            fg_fingerprints = self.fingerprinter.precompute_video_fingerprints(
                fg_info.path, fg_indices, self.processor, resize_factor=0.25
            )
            bg_fingerprints = self.fingerprinter.precompute_video_fingerprints(
                bg_info.path, bg_indices, self.processor, resize_factor=0.25
            )

        if not fg_fingerprints or not bg_fingerprints:
            logger.error("Failed to compute fingerprints")
            logger.warning("Falling back to classic alignment")
            self.use_dtw = False
            return self.align_frames(bg_info, fg_info, trim)

        # Run DTW alignment
        logger.info("Running DTW alignment...")
        dtw_matches = self.dtw_aligner.align_videos(
            fg_fingerprints,
            bg_fingerprints,
            self.fingerprinter.compare_fingerprints,
            show_progress=True,
        )

        if not dtw_matches:
            logger.warning("DTW produced no matches, using direct mapping")
            return self._create_direct_mapping(bg_info, fg_info)

        # Create complete frame alignments from DTW matches
        frame_alignments = self.dtw_aligner.create_frame_alignments(
            dtw_matches, fg_info.frame_count, bg_info.frame_count
        )

        # Apply trimming if requested
        if trim and frame_alignments:
            # Find actual matched range
            bg_indices_used = [a.bg_frame_idx for a in frame_alignments]
            min(bg_indices_used)
            max(bg_indices_used)

            # Trim to only include frames with good matches
            trimmed_alignments = []
            for alignment in frame_alignments:
                if alignment.similarity_score > 0.5:  # Quality threshold
                    trimmed_alignments.append(alignment)

            if trimmed_alignments:
                frame_alignments = trimmed_alignments

        # Calculate overall temporal offset
        if dtw_matches:
            first_match = dtw_matches[0]
            offset_seconds = (first_match[0] / bg_info.fps) - (
                first_match[1] / fg_info.fps
            )
        else:
            offset_seconds = 0.0

        # Calculate overall confidence
        if frame_alignments:
            avg_confidence = sum(a.similarity_score for a in frame_alignments) / len(
                frame_alignments
            )
        else:
            avg_confidence = 0.5

        logger.info(
            f"DTW alignment complete: {len(frame_alignments)} frames, "
            f"offset={offset_seconds:.3f}s, confidence={avg_confidence:.3f}"
        )

        return TemporalAlignment(
            offset_seconds=offset_seconds,
            frame_alignments=frame_alignments,
            method_used="dtw",
            confidence=avg_confidence,
        )

    def create_border_mask(
        self,
        spatial_alignment,
        fg_info: VideoInfo,
        bg_info: VideoInfo,
        border_thickness: int = 8,
    ) -> np.ndarray:
        """Create border mask for border-based temporal alignment.

        The border mask defines the region around the foreground video edges where
        background video is visible. This is used for similarity comparison in border mode.

        Args:
            spatial_alignment: Result from spatial alignment containing x/y offsets
            fg_info: Foreground video information
            bg_info: Background video information
            border_thickness: Thickness of border region in pixels

        Returns:
            Binary mask where 1 indicates border region, 0 indicates non-border
        """
        # Get foreground position on background canvas
        x_offset = spatial_alignment.x_offset
        y_offset = spatial_alignment.y_offset
        fg_width = fg_info.width
        fg_height = fg_info.height
        bg_width = bg_info.width
        bg_height = bg_info.height

        # Create mask same size as background
        mask = np.zeros((bg_height, bg_width), dtype=np.uint8)

        # Define foreground rectangle bounds
        fg_left = x_offset
        fg_right = x_offset + fg_width
        fg_top = y_offset
        fg_bottom = y_offset + fg_height

        # Ensure bounds are within background
        fg_left = max(0, fg_left)
        fg_right = min(bg_width, fg_right)
        fg_top = max(0, fg_top)
        fg_bottom = min(bg_height, fg_bottom)

        # Define border regions based on which edges have visible background

        # Top border (if fg doesn't touch top edge)
        if fg_top > 0:
            border_top = max(0, fg_top - border_thickness)
            mask[border_top:fg_top, fg_left:fg_right] = 1

        # Bottom border (if fg doesn't touch bottom edge)
        if fg_bottom < bg_height:
            border_bottom = min(bg_height, fg_bottom + border_thickness)
            mask[fg_bottom:border_bottom, fg_left:fg_right] = 1

        # Left border (if fg doesn't touch left edge)
        if fg_left > 0:
            border_left = max(0, fg_left - border_thickness)
            mask[fg_top:fg_bottom, border_left:fg_left] = 1

        # Right border (if fg doesn't touch right edge)
        if fg_right < bg_width:
            border_right = min(bg_width, fg_right + border_thickness)
            mask[fg_top:fg_bottom, fg_right:border_right] = 1

        logger.debug(f"Created border mask: {np.sum(mask)} pixels in border region")
        return mask

    def _apply_mask_to_frame(self, frame: np.ndarray, mask: np.ndarray) -> np.ndarray:
        """Apply binary mask to frame, setting non-masked areas to black.

        Args:
            frame: Input frame (H, W, C) or (H, W)
            mask: Binary mask (H, W) where 1 = keep, 0 = zero out

        Returns:
            Masked frame with same dimensions as input
        """
        if len(frame.shape) == 3:
            # Color frame - apply mask to all channels
            masked = frame.copy()
            for c in range(frame.shape[2]):
                masked[:, :, c] = frame[:, :, c] * mask
        else:
            # Grayscale frame
            masked = frame * mask

        return masked

    def create_blend_mask(
        self,
        spatial_alignment,
        fg_info: VideoInfo,
        bg_info: VideoInfo,
        border_thickness: int = 8,
    ) -> np.ndarray:
        """Create blend mask for smooth edge transitions.

        Creates a gradient mask that transitions from fully opaque (1.0) in the center
        of the foreground to fully transparent (0.0) at the edges where background is visible.

        Args:
            spatial_alignment: Result from spatial alignment containing x/y offsets
            fg_info: Foreground video information
            bg_info: Background video information
            border_thickness: Width of gradient transition in pixels

        Returns:
            Float mask with values 0.0-1.0 for alpha blending
        """
        # Get foreground position on background canvas
        x_offset = spatial_alignment.x_offset
        y_offset = spatial_alignment.y_offset
        fg_width = fg_info.width
        fg_height = fg_info.height
        bg_width = bg_info.width
        bg_height = bg_info.height

        # Create mask same size as foreground (will be placed on background)
        mask = np.ones((fg_height, fg_width), dtype=np.float32)

        # Determine which edges need blending (where bg is visible)
        blend_top = y_offset > 0
        blend_bottom = (y_offset + fg_height) < bg_height
        blend_left = x_offset > 0
        blend_right = (x_offset + fg_width) < bg_width

        # Create gradient on edges that need blending
        for y in range(fg_height):
            for x in range(fg_width):
                alpha = 1.0

                # Top edge gradient
                if blend_top and y < border_thickness:
                    alpha = min(alpha, y / border_thickness)

                # Bottom edge gradient
                if blend_bottom and y >= (fg_height - border_thickness):
                    alpha = min(alpha, (fg_height - 1 - y) / border_thickness)

                # Left edge gradient
                if blend_left and x < border_thickness:
                    alpha = min(alpha, x / border_thickness)

                # Right edge gradient
                if blend_right and x >= (fg_width - border_thickness):
                    alpha = min(alpha, (fg_width - 1 - x) / border_thickness)

                mask[y, x] = max(0.0, min(1.0, alpha))

        logger.debug(f"Created blend mask with {border_thickness}px gradient")
        return mask

    def align_frames_with_mask(
        self,
        bg_info: VideoInfo,
        fg_info: VideoInfo,
        trim: bool = False,
        mask: np.ndarray | None = None,
    ) -> TemporalAlignment:
        self._current_mask = mask

        original_use_dtw = self.use_dtw
        original_use_perceptual_hash = self.use_perceptual_hash

        effective_use_dtw = self.use_dtw
        effective_use_perceptual_hash = self.use_perceptual_hash

        if mask is not None:
            logger.info("Border mode active for temporal alignment.")
            if self.use_dtw:
                logger.info(
                    "Border mode with DTW enabled using masked perceptual hashing."
                )
                # DTW now supports masked perceptual hashing
                effective_use_dtw = True

            # In border mode, we can now use masked perceptual hashing
            if not effective_use_dtw and self.use_perceptual_hash:
                logger.info(
                    "Border mode: masked perceptual hashing for faster processing."
                )
                effective_use_perceptual_hash = True
            elif not effective_use_dtw and not self.use_perceptual_hash:
                logger.info(
                    "Border mode: SSIM comparison (perceptual hash unavailable)."
                )
                effective_use_perceptual_hash = False

        alignment_result: TemporalAlignment
        try:
            # Attempt to initialize fingerprinter for DTW if needed and not already done
            if effective_use_dtw and self.fingerprinter is None:
                try:
                    self.fingerprinter = FrameFingerprinter()
                except Exception as e:
                    logger.error(f"Failed to init FrameFingerprinter for DTW: {e}")
                    logger.warning("Falling back to classic alignment.")
                    effective_use_dtw = False
                    self.use_perceptual_hash = original_use_perceptual_hash
                    effective_use_perceptual_hash = original_use_perceptual_hash

            if effective_use_dtw and self.fingerprinter is not None:
                logger.info("Using DTW-based temporal alignment (full frame).")
                alignment_result = self._align_frames_dtw(bg_info, fg_info, trim)
            else:
                log_hash_status = (
                    "Enabled"
                    if self.use_perceptual_hash and self.hasher
                    else "Disabled (SSIM)"
                )
                logger.info(
                    f"Using classic (keyframe-based) temporal alignment. "
                    f"Hash/SSIM: {log_hash_status}."
                )
                keyframe_matches = self._find_keyframe_matches(bg_info, fg_info)

                if not keyframe_matches:
                    logger.warning("No keyframe matches found, using direct mapping.")
                    alignment_result = self._create_direct_mapping(bg_info, fg_info)
                else:
                    frame_alignments = self._build_frame_alignments(
                        bg_info, fg_info, keyframe_matches, trim
                    )
                    first_match = keyframe_matches[0]
                    offset_seconds = (first_match[0] / bg_info.fps) - (
                        first_match[1] / fg_info.fps
                    )
                    confidence = self._calculate_alignment_confidence(keyframe_matches)

                    method_detail = (
                        "hash"
                        if effective_use_perceptual_hash and self.hasher
                        else "SSIM"
                    )
                    base_method_str = "border" if mask is not None else "frames"
                    method_used_str = f"{base_method_str} (classic/{method_detail})"

                    alignment_result = TemporalAlignment(
                        offset_seconds=offset_seconds,
                        frame_alignments=frame_alignments,
                        method_used=method_used_str,
                        confidence=confidence,
                    )
            return alignment_result
        finally:
            self._current_mask = None
            self.use_dtw = original_use_dtw
            self.use_perceptual_hash = original_use_perceptual_hash

    def _find_keyframe_matches_with_mask(
        self, bg_info: VideoInfo, fg_info: VideoInfo, mask: np.ndarray | None = None
    ) -> list[tuple[int, int, float]]:
        """Find matching keyframes between videos with optional mask support."""
        # This is similar to _find_keyframe_matches but uses masked similarity
        # For now, we'll modify the existing method to support masks
        return self._find_keyframe_matches(bg_info, fg_info)
</file>

<file path="TODO.md">
# TODO

Fix this and then try to run `./benchmark.sh` to test your fix. 

```
2025-05-25 23:35:54.196 | WARNING  | vidkompy.core.multi_resolution_aligner:<module>:27 - Numba optimizations not available for multi-resolution alignment
23:35:54 | INFO     | main - CLI options used:
23:35:54 | INFO     | main -   Background video: tests/bg.mp4
23:35:54 | INFO     | main -   Foreground video: tests/fg.mp4
23:35:54 | INFO     | main -   Output path: tests/q-tunnel_mask-w30.mp4
23:35:54 | INFO     | main -   Engine: tunnel_mask
23:35:54 | INFO     | main -   Drift interval: 100
23:35:54 | INFO     | main -   Window: 30
23:35:54 | INFO     | main -   Margin: 8
23:35:54 | INFO     | main -   Smooth blending: False
23:35:54 | INFO     | main -   GPU acceleration: False
23:35:54 | INFO     | main -   Verbose logging: True
23:35:54 | INFO     | __init__ - ✓ Perceptual hashing enabled (pHash)
23:35:54 | INFO     | process - Analyzing videos...
23:35:54 | DEBUG    | get_video_info - Probing video: tests/bg.mp4
23:35:54 | INFO     | get_video_info - Video info for bg.mp4: 1920x1080, 60.00 fps, 7.85s, 472 frames, audio: yes
23:35:54 | DEBUG    | get_video_info - Probing video: tests/fg.mp4
23:35:54 | INFO     | get_video_info - Video info for fg.mp4: 1920x870, 60.89 fps, 8.04s, 483 frames, audio: yes
23:35:54 | INFO     | _log_compatibility - Video compatibility check:
23:35:54 | INFO     | _log_compatibility -   Resolution: 1920x1080 vs 1920x870
23:35:54 | INFO     | _log_compatibility -   FPS: 60.00 vs 60.89
23:35:54 | INFO     | _log_compatibility -   Duration: 7.85s vs 8.04s
23:35:54 | INFO     | _log_compatibility -   Audio: yes vs yes
23:35:54 | INFO     | process - Computing spatial alignment...
23:35:55 | DEBUG    | _template_matching - Using template matching for spatial alignment
23:35:55 | INFO     | _template_matching - Template match found at (0, 0) with confidence 0.941
23:35:55 | INFO     | process - Spatial alignment result: offset=(0, 0), scale=1.000, confidence=0.941
23:35:55 | INFO     | process - Computing temporal alignment...
23:35:55 | INFO     | align_frames - Starting frame-based temporal alignment
23:35:55 | INFO     | align_frames - Using tunnel_mask temporal alignment engine
23:35:55 | INFO     | _align_frames_tunnel - Performing spatial alignment for tunnel engine...
23:35:55 | DEBUG    | _template_matching - Using template matching for spatial alignment
23:35:55 | INFO     | _template_matching - Template match found at (0, 0) with confidence 0.941
23:35:55 | INFO     | _align_frames_tunnel - Spatial offset: (0, 0)
23:35:55 | INFO     | _align_frames_tunnel - Extracting all frames for tunnel alignment...
23:35:55 | INFO     | extract_all_frames - Extracting all 483 frames at 480x217
23:35:56 | INFO     | extract_all_frames - Extracted 483 frames from fg.mp4
23:35:57 | INFO     | extract_all_frames - Extracting all 472 frames at 480x270
23:35:58 | INFO     | extract_all_frames - Extracted 472 frames from bg.mp4
23:35:58 | INFO     | _align_frames_tunnel - Performing tunnel_mask alignment...
23:35:58 | INFO     | align - Generated mask with 95.4% coverage
23:35:58 | INFO     | align - Starting tunnel alignment with 483 FG and 472 BG frames
23:35:58 | INFO     | align - Config: window_size=30, downsample=0.5
23:35:59 | INFO     | align - Downsampled to 483 FG and 472 BG frames
23:35:59 | DEBUG    | _forward_pass - Forward: FG 0 -> BG 0 (diff=11.626)
23:36:02 | DEBUG    | _forward_pass - Forward: FG 100 -> BG 98 (diff=5.191)
23:36:06 | DEBUG    | _forward_pass - Forward: FG 200 -> BG 196 (diff=4.169)
23:36:10 | DEBUG    | _forward_pass - Forward: FG 300 -> BG 298 (diff=6.616)
23:36:14 | DEBUG    | _forward_pass - Forward: FG 400 -> BG 399 (diff=4.957)
23:36:20 | DEBUG    | _backward_pass - Backward: FG 400 -> BG 399 (diff=4.957)
23:36:24 | DEBUG    | _backward_pass - Backward: FG 300 -> BG 298 (diff=6.616)
23:36:29 | DEBUG    | _backward_pass - Backward: FG 200 -> BG 196 (diff=4.169)
23:36:33 | DEBUG    | _backward_pass - Backward: FG 100 -> BG 98 (diff=5.191)
23:36:36 | DEBUG    | _backward_pass - Backward: FG 0 -> BG 0 (diff=11.626)
23:36:36 | INFO     | _merge_mappings - Merge: avg difference = 0.00, confidence = 1.000
23:36:36 | INFO     | align - Cache stats: 0 hits, 0 misses
23:36:36 | INFO     | align - Final confidence: 1.000
23:36:36 | ERROR    | main - Processing failed: 'FrameAlignment' object has no attribute 'bg_frame'
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/adam/Developer/vcs/github.twardoch/pub/vidkompy/src/vidkompy/__main__.py", line 15, in <module>
    cli()
  File "/Users/adam/Developer/vcs/github.twardoch/pub/vidkompy/src/vidkompy/__main__.py", line 11, in cli
    fire.Fire(main)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/fire/core.py", line 135, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/fire/core.py", line 468, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
                                ^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
                ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/adam/Developer/vcs/github.twardoch/pub/vidkompy/src/vidkompy/vidkompy.py", line 157, in main
    alignment.process(
  File "/Users/adam/Developer/vcs/github.twardoch/pub/vidkompy/src/vidkompy/core/alignment_engine.py", line 138, in process
    temporal_alignment = self._compute_temporal_alignment(
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/adam/Developer/vcs/github.twardoch/pub/vidkompy/src/vidkompy/core/alignment_engine.py", line 277, in _compute_temporal_alignment
    return self.temporal_aligner.align_frames(bg_info, fg_info, trim)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/adam/Developer/vcs/github.twardoch/pub/vidkompy/src/vidkompy/core/temporal_alignment.py", line 169, in align_frames
    return self._align_frames_tunnel(bg_info, fg_info, trim)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/adam/Developer/vcs/github.twardoch/pub/vidkompy/src/vidkompy/core/temporal_alignment.py", line 291, in _align_frames_tunnel
    offset_seconds = (first_align.bg_frame / bg_info.fps) - (first_align.fg_frame / fg_info.fps)
                      ^^^^^^^^^^^^^^^^^^^^
AttributeError: 'FrameAlignment' object has no attribute 'bg_frame'

```
</file>

</files>
