# Aniworld/src/core/utils/key_utils.py

"""Utility functions for generating URL-safe keys from folder names.

This module provides key generation and normalization for anime series,
handling edge cases like non-Latin characters and special symbols.
"""
from __future__ import annotations

import re
import unicodedata
import uuid
from typing import Optional

# Valid key pattern: ASCII letters, digits, hyphens and underscores.
# The first character must be a letter or digit, so the pattern itself
# requires at least one character.
# `\Z` anchors at the true end of the string; a bare `$` would also match
# just before a trailing newline and let "key\n" through.
VALID_KEY_PATTERN = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_-]*\Z')


def normalize_key(key: str) -> str:
    """Reduce an arbitrary key to lowercase ASCII words joined by hyphens.

    The key is lowercased, then passed through a fixed sequence of regex
    substitutions, and finally stripped of leading/trailing hyphens.

    Args:
        key: The key to normalize

    Returns:
        The normalized key containing only ``a-z``, ``0-9`` and ``-``.
        An empty (or otherwise falsy) input yields an empty string, as
        does input that has no ASCII letters or digits.
    """
    if not key:
        return ""

    # Applied in order; each step relies on the output of the previous one.
    substitutions = (
        (r'[\s_]+', '-'),       # runs of whitespace/underscores -> one hyphen
        (r'[^a-z0-9-]', ''),    # drop everything that is not a-z, 0-9 or '-'
        (r'-+', '-'),           # collapse hyphen runs created by the removals
    )

    result = key.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result.strip('-')


def is_valid_key(key: str) -> bool:
    """Check if a key is valid for URL-safe use.

    A valid key is at least two characters long, starts with an ASCII
    letter or digit, and otherwise contains only ASCII letters, digits,
    hyphens and underscores (as defined by ``VALID_KEY_PATTERN``). The
    last character may be a hyphen or underscore.

    Args:
        key: The key to validate

    Returns:
        True if the key satisfies the rules above, False otherwise
        (including for empty or whitespace-only input).
    """
    if not key or len(key) < 2:
        return False

    # fullmatch() requires the pattern to consume the entire string. With
    # match() and a `$` anchor, a key with a trailing newline ("abc\n")
    # would be accepted because `$` also matches just before it.
    return VALID_KEY_PATTERN.fullmatch(key) is not None


# Apostrophe-like characters that are dropped outright (instead of being
# treated as word separators) so that "Hell's" becomes "hells", not "hell-s".
# Written as escapes so the set survives editors/encodings that fold
# typographic quotes into ASCII.
_APOSTROPHE_CHARS = frozenset({
    "'",       # U+0027 APOSTROPHE
    "\u2018",  # LEFT SINGLE QUOTATION MARK
    "\u2019",  # RIGHT SINGLE QUOTATION MARK
    "\u02bc",  # MODIFIER LETTER APOSTROPHE
    "`",       # U+0060 GRAVE ACCENT
})


def _hyphenate_words(text: str) -> str:
    """Lowercase the whitespace-separated words of *text* and join them with '-'."""
    return '-'.join(word.lower() for word in text.split())


def generate_key_from_folder(folder_name: str) -> str:
    """Generate a URL-safe key from a folder name.

    Candidates are built in the order below and the first one accepted by
    ``is_valid_key`` is returned:

    1. The NFC-normalized name with apostrophes dropped and every other
       non-alphanumeric character treated as a word separator; words are
       lowercased and hyphen-joined. This only passes validation when the
       result is pure ASCII.
    2. The same text with all non-ASCII characters removed.
    3. The original name in NFD form with combining marks stripped
       (e.g. "é" -> "e"), restricted to ASCII letters and digits.
    4. ``series-<8 hex chars>`` built from a random UUID.

    Args:
        folder_name: The folder name to convert to a key

    Returns:
        A URL-safe key string. Never returns empty string. The final
        fallback is random, so repeated calls for a name that cannot be
        represented in ASCII return different keys.

    Raises:
        ValueError: If ``folder_name`` is empty or whitespace-only.

    Examples:
        >>> generate_key_from_folder("Attack on Titan (2013)")
        'attack-on-titan-2013'
        >>> generate_key_from_folder("A Time Called You (2023)")
        'a-time-called-you-2023'
        >>> generate_key_from_folder("25-sai no Joshikousei (2018)")
        '25-sai-no-joshikousei-2018'
    """
    if not folder_name or not folder_name.strip():
        raise ValueError("Folder name cannot be empty")

    # Step 1: Unicode NFC normalization so composed/decomposed spellings of
    # the same name are processed identically.
    normalized = unicodedata.normalize('NFC', folder_name.strip())

    # Step 2: Keep alphanumerics (any script), drop apostrophes, and turn
    # everything else into a word separator.
    parts = []
    for char in normalized:
        # Apostrophes must be tested before isalnum(): U+02BC is a letter
        # (category Lm), so isalnum() would otherwise keep it.
        if char in _APOSTROPHE_CHARS:
            continue
        parts.append(char if char.isalnum() else ' ')
    working = ''.join(parts)

    # Steps 3-5: Hyphenate the words; accept if already a valid key.
    key = _hyphenate_words(working)
    if is_valid_key(key):
        return key

    # Step 6: Drop every non-ASCII character and try again.
    key = _hyphenate_words(re.sub(r'[^a-zA-Z0-9\s]', '', working))
    if is_valid_key(key):
        return key

    # Step 7: Latinize accented letters: decompose (NFD), strip combining
    # marks, then keep ASCII letters/digits only. These are pure string
    # operations on an already-normalized str, so no exception handling
    # is needed here.
    nfd_form = unicodedata.normalize('NFD', folder_name)
    latinized = ''.join(
        char for char in nfd_form
        if unicodedata.category(char) != 'Mn'  # Mn = combining marks
    )
    key = _hyphenate_words(re.sub(r'[^a-zA-Z0-9\s]', ' ', latinized))
    if is_valid_key(key):
        return key

    # Step 8: Absolute fallback. Reaching this point means the name holds
    # at most one ASCII letter/digit even after latinization, so nothing
    # from it can form a readable URL-safe prefix. Use a short random
    # identifier (first 8 hex chars of a UUID) instead; the result always
    # satisfies is_valid_key.
    return f"series-{uuid.uuid4().hex[:8]}"


def validate_key_uniqueness(
    key: str,
    existing_keys: set[str],
) -> tuple[bool, str]:
    """Check that a key is well-formed and not already taken.

    The key is stripped of surrounding whitespace before any check. The
    checks run in a fixed order (empty, too short, not URL-safe, already
    present) and the first failure determines the message.

    Args:
        key: The key to validate
        existing_keys: Set of keys that already exist

    Returns:
        Tuple of (is_valid, error_message); the message is an empty
        string when the key is valid.
    """
    candidate = key.strip() if key else ""
    if not candidate:
        return False, "Key cannot be empty"

    if len(candidate) < 2:
        problem = "Key must be at least 2 characters"
    elif not is_valid_key(candidate):
        problem = "Key must be URL-safe (alphanumeric, hyphens, underscores only)"
    elif candidate in existing_keys:
        problem = f"Key '{candidate}' is already in use"
    else:
        return True, ""

    return False, problem


def sanitize_key_for_url(key: str) -> str:
    """Strip a key down to word characters and hyphens.

    Spaces become hyphens, every character that is neither a word
    character (``\\w``) nor a hyphen is removed, hyphen runs are collapsed
    and leading/trailing hyphens are trimmed. Case is preserved.

    Note: for ``str`` patterns ``\\w`` is Unicode-aware, so letters and
    digits from non-Latin scripts (and underscores) are kept.

    Args:
        key: The key to sanitize

    Returns:
        The sanitized key, or an empty string for empty input
    """
    if key:
        hyphenated = key.replace(' ', '-')
        # Removal happens before collapsing so hyphens that become adjacent
        # once a symbol between them is deleted are merged as well.
        word_chars_only = re.sub(r'[^\w\-]', '', hyphenated)
        collapsed = re.sub(r'-+', '-', word_chars_only)
        return collapsed.strip('-')

    return ""


# Lowercase fragments that mark a query parameter name as sensitive.
# Matching is by substring, so "api_key", "access_token" and "X-Auth" are
# all covered; over-matching (e.g. "keyword") only costs log detail.
_SENSITIVE_QUERY_MARKERS = (
    "token", "key", "secret", "password", "passwd", "pwd",
    "auth", "sig", "session",
)


def sanitize_url_for_logging(url: str, max_length: int = 100) -> str:
    """Sanitize a URL for safe logging by masking sensitive query parameters.

    The value of every query parameter whose name contains one of
    ``_SENSITIVE_QUERY_MARKERS`` (case-insensitive) is replaced with
    ``***``. Parameter names, non-sensitive parameters, and the rest of
    the URL are left byte-for-byte unchanged so the log line stays useful
    for debugging. Masking happens before truncation.

    Args:
        url: The URL to sanitize
        max_length: Maximum number of URL characters kept. A longer
            (already masked) URL is cut at this length and suffixed with
            ``...``, so the returned string can be up to three characters
            longer than ``max_length``.

    Returns:
        Sanitized URL safe for logging; empty string for empty input
    """
    if not url:
        return ""

    # Split by hand instead of urlsplit/urlunsplit so that URLs without
    # sensitive parameters round-trip exactly. The fragment is cut off
    # first because a '?' after '#' belongs to the fragment, not the query.
    head, hash_sep, fragment = url.partition('#')
    base, query_sep, query = head.partition('?')

    if query:
        masked_pairs = []
        for pair in query.split('&'):
            name, equals, _value = pair.partition('=')
            lowered = name.lower()
            if equals and any(marker in lowered for marker in _SENSITIVE_QUERY_MARKERS):
                masked_pairs.append(f"{name}=***")
            else:
                masked_pairs.append(pair)
        query = '&'.join(masked_pairs)

    sanitized = f"{base}{query_sep}{query}{hash_sep}{fragment}"

    # Truncate if too long
    if len(sanitized) > max_length:
        return sanitized[:max_length] + "..."

    return sanitized