Files
Aniworld/src/server/utils/key_utils.py
Lukas 5526ab884a refactor: restructure core→server, split large entity files into database module
- Move src/core/ → src/server/
- Split SerieList.py (531 lines) and series.py (414 lines) into src/server/database/
- Add database/models.py for SQLAlchemy models
- Update all test imports to reflect new structure
- Remove deprecated test files (test_serie_class.py, test_serie_folder_with_year.py)
2026-06-04 21:11:53 +02:00

249 lines
7.2 KiB
Python

"""Utility functions for generating URL-safe keys from folder names.
This module provides key generation and normalization for anime series,
handling edge cases like non-Latin characters and special symbols.
"""
from __future__ import annotations
import re
import unicodedata
import uuid
from typing import Optional
# Pattern for a valid key: one ASCII letter or digit, followed by any number
# of ASCII letters, digits, underscores, or hyphens (so at least 1 character).
# NOTE: with match(), `$` also matches just before a trailing newline; use
# fullmatch() wherever a key ending in "\n" must be rejected.
VALID_KEY_PATTERN = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_-]*$')
def normalize_key(key: str) -> str:
    """Return a lowercase, hyphen-separated, ASCII-only form of ``key``.

    Args:
        key: The key to normalize.

    Returns:
        The normalized key, or an empty string when ``key`` is falsy or
        contains no ASCII letters or digits.
    """
    if not key:
        return ""

    # Applied in order: the later steps rely on the earlier ones.
    substitutions = (
        (r'[\s_]+', '-'),      # whitespace / underscore runs become one hyphen
        (r'[^a-z0-9-]', ''),   # drop everything else that is not a-z, 0-9, '-'
        (r'-+', '-'),          # removals can leave adjacent hyphens; collapse
    )
    result = key.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    # No hyphen at either end of the final key.
    return result.strip('-')
def is_valid_key(key: str) -> bool:
    """Check whether a key is acceptable as a URL-safe identifier.

    Args:
        key: The key to validate.

    Returns:
        True when ``key`` is at least 2 characters long, starts with an
        ASCII letter or digit, and otherwise contains only ASCII letters,
        digits, hyphens, and underscores. A trailing hyphen or underscore
        is accepted; only the first character is constrained.
    """
    if not key or not key.strip():
        return False
    if len(key) < 2:
        return False
    # fullmatch() instead of match(): the pattern ends in `$`, which also
    # matches just before a trailing newline, so match() would accept a key
    # such as "abc\n" as URL-safe.
    return VALID_KEY_PATTERN.fullmatch(key) is not None
# Apostrophe-like characters that are deleted outright (not turned into a
# word break), so that "Hell's" becomes "hells" rather than "hell-s".
# Written as escapes so the set survives editors that normalize quotes.
_APOSTROPHE_CHARS = frozenset({
    "\u0027",  # APOSTROPHE
    "\u2018",  # LEFT SINGLE QUOTATION MARK
    "\u2019",  # RIGHT SINGLE QUOTATION MARK
    "\u02bc",  # MODIFIER LETTER APOSTROPHE
    "\u0060",  # GRAVE ACCENT
})


def _hyphenate_words(text: str) -> str:
    """Lowercase the whitespace-separated words of ``text`` and join with '-'."""
    return '-'.join(word.lower() for word in text.split())


def generate_key_from_folder(folder_name: str) -> str:
    """Generate a key from a folder name.

    Candidates are tried in order and the first one accepted by
    ``is_valid_key`` is returned:

    1. Letters/digits of any script, with other characters as word breaks.
    2. The same text with every non-ASCII character removed.
    3. The original name with combining marks stripped (NFD) and every
       remaining non-ASCII character treated as a word break.
    4. A fallback ending in 8 random hex digits from ``uuid.uuid4()``.

    Args:
        folder_name: The folder name to convert to a key.

    Returns:
        A non-empty key string. When the fallback (4) is used the result
        differs between calls, and it may contain non-ASCII letters taken
        from the folder name.

    Raises:
        ValueError: If ``folder_name`` is empty or only whitespace.

    Examples:
        >>> generate_key_from_folder("Attack on Titan (2013)")
        'attack-on-titan-2013'
        >>> generate_key_from_folder("A Time Called You (2023)")
        'a-time-called-you-2023'
        >>> generate_key_from_folder("25-sai no Joshikousei (2018)")
        '25-sai-no-joshikousei-2018'
    """
    if not folder_name or not folder_name.strip():
        raise ValueError("Folder name cannot be empty")

    # Step 1: NFC so composed and decomposed spellings produce the same key.
    normalized = unicodedata.normalize('NFC', folder_name.strip())

    # Step 2: keep letters/digits of any script; everything else is a word
    # break. Apostrophes are tested first because U+02BC is a letter-category
    # character: str.isalnum() is true for it, so testing isalnum() first
    # would keep it instead of dropping it.
    pieces = []
    for char in normalized:
        if char in _APOSTROPHE_CHARS:
            continue
        pieces.append(char if char.isalnum() else ' ')
    working = ''.join(pieces)

    # Steps 3-5: hyphenate and accept if it already is a valid key.
    key = _hyphenate_words(working)
    if is_valid_key(key):
        return key

    # Step 6: remove (not replace) every non-ASCII character and retry.
    # NOTE(review): accented letters are deleted here, so "Pokémon" yields
    # "pokmon" and the latinizing step below is never reached for such names.
    key = _hyphenate_words(re.sub(r'[^a-zA-Z0-9\s]', '', working))
    if is_valid_key(key):
        return key

    # Step 7: decompose (NFD) and strip combining marks so accented Latin
    # letters fall back to their base letter, then treat anything still
    # non-ASCII as a word break. None of these calls raise for str input,
    # so no exception handling is needed.
    decomposed = unicodedata.normalize('NFD', folder_name)
    latinized = ''.join(
        char for char in decomposed
        if unicodedata.category(char) != 'Mn'
    )
    key = _hyphenate_words(re.sub(r'[^a-zA-Z0-9\s]', ' ', latinized))
    if is_valid_key(key):
        return key

    # Step 8: absolute fallback - a short random suffix guarantees a
    # non-empty result even for names with no usable ASCII content.
    uuid_key = uuid.uuid4().hex[:8]

    # Keep whatever letters/digits the name has (any script), with each
    # other character becoming a hyphen once the first letter has been seen.
    meaningful_parts = []
    for char in folder_name:
        if char.isalnum():
            meaningful_parts.append(char.lower())
        elif meaningful_parts:
            meaningful_parts.append('-')
    fallback_base = ''.join(meaningful_parts).strip('-')

    if len(fallback_base) >= 2:
        # Cap the readable part at 20 characters; the suffix keeps it unique.
        return f"{fallback_base[:20]}-{uuid_key}"
    return f"series-{uuid_key}"
def validate_key_uniqueness(key: str, existing_keys: set[str]) -> tuple[bool, str]:
    """Check that a key is well-formed and not already taken.

    Surrounding whitespace is ignored for every check, including the
    membership test against ``existing_keys``.

    Args:
        key: The key to validate.
        existing_keys: Set of keys that already exist.

    Returns:
        ``(True, "")`` when the key is usable, otherwise ``(False, reason)``
        where ``reason`` describes the first check that failed.
    """
    candidate = key.strip() if key else ""

    # Checks run from cheapest to most specific; the first failure wins.
    if not candidate:
        problem = "Key cannot be empty"
    elif len(candidate) < 2:
        problem = "Key must be at least 2 characters"
    elif not is_valid_key(candidate):
        problem = "Key must be URL-safe (alphanumeric, hyphens, underscores only)"
    elif candidate in existing_keys:
        problem = f"Key '{candidate}' is already in use"
    else:
        return True, ""
    return False, problem
def sanitize_key_for_url(key: str) -> str:
    """Strip a key down to word characters and single hyphens.

    Args:
        key: The key to sanitize.

    Returns:
        The key with spaces turned into hyphens, every character other than
        word characters (``\\w``) and hyphens removed, hyphen runs collapsed,
        and no hyphen at either end. Empty string for falsy input.
    """
    if key:
        # Spaces become hyphens before filtering so word boundaries survive;
        # any other whitespace is simply removed by the filter.
        filtered = re.sub(r'[^\w\-]', '', key.replace(' ', '-'))
        # Filtering can leave adjacent hyphens behind; reduce them to one.
        return re.sub(r'-+', '-', filtered).strip('-')
    return ""
# A parameter whose name ends in one of these (case-insensitive) is treated
# as sensitive. Suffix matching catches "api_key", "access_token", "oauth",
# and so on, without also masking harmless names such as "keyword".
_SENSITIVE_PARAM_SUFFIXES = (
    "token", "key", "secret", "password", "passwd", "pwd",
    "auth", "signature", "sig", "session", "sessionid",
)

# One "name=value" pair introduced by "?", "&" or "#". Fragments are
# included because some auth flows return tokens after the "#".
_URL_PARAM_PATTERN = re.compile(
    r'(?P<sep>[?&#])(?P<name>[^=&#]+)=(?P<value>[^&#]*)'
)


def _mask_sensitive_param(match: "re.Match[str]") -> str:
    """Return the matched pair with its value masked if the name is sensitive."""
    name = match.group("name")
    if name.lower().endswith(_SENSITIVE_PARAM_SUFFIXES):
        return f"{match.group('sep')}{name}=***"
    return match.group(0)


def sanitize_url_for_logging(url: str, max_length: int = 100) -> str:
    """Sanitize a URL for safe logging.

    Values of query/fragment parameters whose names look sensitive (token,
    key, secret, password, auth, signature, session) are replaced with
    ``***``; parameter names and all other parts of the URL are kept so the
    log line stays useful for debugging. The result is then truncated.

    Args:
        url: The URL to sanitize.
        max_length: Number of characters kept before truncation. A
            truncated result has ``"..."`` appended, so it is up to three
            characters longer than ``max_length``.

    Returns:
        The masked (and possibly truncated) URL, or an empty string when
        ``url`` is falsy.
    """
    if not url:
        return ""

    # Mask first, truncate second: truncating first could cut a parameter
    # in half and leave part of a secret in the output.
    masked = _URL_PARAM_PATTERN.sub(_mask_sensitive_param, url)

    if len(masked) > max_length:
        return masked[:max_length] + "..."
    return masked