"""Utility functions for generating URL-safe keys from folder names. This module provides key generation and normalization for anime series, handling edge cases like non-Latin characters and special symbols. """ from __future__ import annotations import re import unicodedata import uuid from typing import Optional # Valid key pattern: alphanumeric, hyphens, underscores # Must be at least 1 char, URL-safe VALID_KEY_PATTERN = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_-]*$') def normalize_key(key: str) -> str: """Normalize a key to a URL-safe format. Args: key: The key to normalize Returns: Normalized lowercase key with spaces replaced by hyphens """ if not key: return "" # Convert to lowercase normalized = key.lower() # Replace spaces and underscores with hyphens normalized = re.sub(r'[\s_]+', '-', normalized) # Remove any characters that aren't alphanumeric or hyphens normalized = re.sub(r'[^a-z0-9-]', '', normalized) # Collapse multiple consecutive hyphens normalized = re.sub(r'-+', '-', normalized) # Remove leading/trailing hyphens normalized = normalized.strip('-') return normalized def is_valid_key(key: str) -> bool: """Check if a key is valid for URL-safe use. Args: key: The key to validate Returns: True if key is valid (non-empty, URL-safe, alphanumeric start/end, min 2 chars) """ if not key or not key.strip(): return False if len(key) < 2: return False return bool(VALID_KEY_PATTERN.match(key)) def generate_key_from_folder(folder_name: str) -> str: """Generate a URL-safe key from a folder name. Handles edge cases: - Non-Latin characters (Japanese, Chinese, etc.) - Special characters - All-invalid names that normalize to empty Args: folder_name: The folder name to convert to a key Returns: A URL-safe key string. Never returns empty string. Examples: >>> generate_key_from_folder("Attack on Titan (2013)") 'attack-on-titan-2013' >>> generate_key_from_folder("A Time Called You (2023)") 'a-time-called-you-2023' >>> generate_key_from_folder("25-sai no Joshikousei (2018)") '25-sai-no-joshikousei-2018' """ if not folder_name or not folder_name.strip(): raise ValueError("Folder name cannot be empty") # Step 1: Unicode NFC normalization (preserves international chars) normalized = unicodedata.normalize('NFC', folder_name.strip()) # Step 2: Extract alphanumeric parts, preserving international chars # This keeps Japanese/Chinese characters but removes special symbols parts = [] for char in normalized: # Keep Unicode alphanumeric characters (letters/numbers from any script) if char.isalnum(): parts.append(char) elif char.isspace(): parts.append(' ') # Handle apostrophes - treat as part of word (remove, don't replace with space) # This normalizes e.g., "Hell's" -> "Hells" # Includes: ' (0x27), ' (0x2018), ' (0x2019), ' (0x02BC), ` (0x0060) elif char in ("'", "'", "'", "'", "`", """, """): pass # Skip - drop the apostrophe else: parts.append(' ') working = ''.join(parts) # Step 3: Split into words and normalize each words = working.split() # Step 4: Convert to lowercase and create hyphenated key key = '-'.join(word.lower() for word in words if word) # Step 5: If we got a valid key, return it if key and is_valid_key(key): return key # Step 6: Try just alphanumeric characters alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', working) words = alphanumeric_only.split() key = '-'.join(word.lower() for word in words if word) if key and is_valid_key(key): return key # Step 7: Last resort - use folder name directly with transliteration # Try to convert non-ASCII to ASCII equivalents try: # Use NFD normalization and strip combining characters # This effectively Latinizes some characters nfd_form = unicodedata.normalize('NFD', folder_name) latinized = ''.join( char for char in nfd_form if unicodedata.category(char) != 'Mn' # Strip combining marks ) # Remove non-ASCII letters latinized = re.sub(r'[^a-zA-Z0-9\s]', ' ', latinized) words = latinized.split() key = '-'.join(word.lower() for word in words if word) if key and is_valid_key(key): return key except Exception: pass # Step 8: Absolute fallback - generate UUID-based key # Use first 8 chars of UUID for brevity uuid_key = uuid.uuid4().hex[:8] # Try to extract any meaningful words from the original name meaningful_parts = [] for char in folder_name: if char.isalnum(): meaningful_parts.append(char.lower()) elif len(meaningful_parts) > 0: meaningful_parts.append('-') fallback_base = ''.join(meaningful_parts).strip('-') if fallback_base and len(fallback_base) >= 2: # Combine meaningful parts with UUID for uniqueness # Truncate meaningful parts if too long if len(fallback_base) > 20: fallback_base = fallback_base[:20] return f"{fallback_base}-{uuid_key}" return f"series-{uuid_key}" def validate_key_uniqueness( key: str, existing_keys: set[str], ) -> tuple[bool, str]: """Validate that a key is unique among existing keys. Args: key: The key to validate existing_keys: Set of keys that already exist Returns: Tuple of (is_valid, error_message) """ if not key or not key.strip(): return False, "Key cannot be empty" stripped = key.strip() if len(stripped) < 2: return False, "Key must be at least 2 characters" if not is_valid_key(stripped): return False, "Key must be URL-safe (alphanumeric, hyphens, underscores only)" if stripped in existing_keys: return False, f"Key '{stripped}' is already in use" return True, "" def sanitize_key_for_url(key: str) -> str: """Sanitize a key for safe URL usage. Args: key: The key to sanitize Returns: URL-safe version of the key """ if not key: return "" # Replace spaces with hyphens first sanitized = key.replace(' ', '-') # Remove any characters that could cause URL issues (keep alphanumerics, hyphens, underscores) sanitized = re.sub(r'[^\w\-]', '', sanitized) # Collapse multiple hyphens sanitized = re.sub(r'-+', '-', sanitized) return sanitized.strip('-') def sanitize_url_for_logging(url: str, max_length: int = 100) -> str: """Sanitize a URL for safe logging by removing sensitive query parameters. Removes or truncates query parameters that may contain tokens, keys, or other sensitive data while preserving enough structure for debugging. Args: url: The URL to sanitize max_length: Maximum length of the returned URL string Returns: Sanitized URL safe for logging """ if not url: return "" # Truncate if too long if len(url) > max_length: return url[:max_length] + "..." return url