"""Utility functions for generating URL-safe keys from folder names.

This module provides key generation and normalization for anime series,
handling edge cases like non-Latin characters and special symbols.
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import unicodedata
|
|
import uuid
|
|
from typing import Optional
|
|
|
|
# Valid key pattern: an ASCII letter or digit, followed by any number of
# ASCII letters, digits, hyphens, or underscores (so at least 1 character).
# Anchored with \Z instead of $ because $ also matches just before a trailing
# newline, which would let a value such as "abc\n" pass as a URL-safe key.
VALID_KEY_PATTERN = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_-]*\Z')
|
|
|
|
|
|
def normalize_key(key: str) -> str:
    """Normalize a key to a lowercase, hyphen-separated ASCII form.

    Args:
        key: The key to normalize

    Returns:
        The lowercased key with runs of whitespace/underscores turned into a
        single hyphen, every character outside ``a-z``, ``0-9`` and ``-``
        removed, repeated hyphens collapsed, and leading/trailing hyphens
        stripped. Returns an empty string for empty input, or when nothing
        survives the filtering.
    """
    if not key:
        return ""

    # Applied in order: the later rules clean up after the earlier ones
    # (removing characters can leave adjacent hyphens behind).
    rewrite_rules = (
        (r'[\s_]+', '-'),      # whitespace / underscores -> hyphen
        (r'[^a-z0-9-]', ''),   # drop anything that is not a-z, 0-9 or hyphen
        (r'-+', '-'),          # collapse runs of hyphens
    )

    result = key.lower()
    for pattern, replacement in rewrite_rules:
        result = re.sub(pattern, replacement, result)

    return result.strip('-')
|
|
|
|
|
|
def is_valid_key(key: str) -> bool:
    """Check if a key is valid for URL-safe use.

    Args:
        key: The key to validate

    Returns:
        True if the key is at least 2 characters long, starts with an ASCII
        letter or digit, and otherwise contains only ASCII letters, digits,
        hyphens, and underscores. False for anything else, including empty
        or whitespace-only input. The last character is not restricted
        beyond that set, so a trailing hyphen or underscore is accepted.
    """
    if not key or not key.strip():
        return False

    if len(key) < 2:
        return False

    # fullmatch() guarantees the whole string is consumed. A `$` anchor on its
    # own also matches just before a trailing newline, so match() could accept
    # a key such as "abc\n".
    return VALID_KEY_PATTERN.fullmatch(key) is not None
|
|
|
|
|
|
# Apostrophe-like characters that are dropped instead of being turned into a
# word break, so "Hell's" becomes "hells" rather than "hell-s". Written as
# escapes so the set survives tools that normalize "smart" quotes to ASCII.
_APOSTROPHES = frozenset({
    "\u0027",  # apostrophe
    "\u2018",  # left single quotation mark
    "\u2019",  # right single quotation mark
    "\u02bc",  # modifier letter apostrophe
    "\u0060",  # grave accent / backtick
})


def _hyphenate(text: str) -> str:
    """Lowercase the whitespace-separated words of *text* and join them with hyphens."""
    return '-'.join(word.lower() for word in text.split())


def generate_key_from_folder(folder_name: str) -> str:
    """Generate a URL-safe key from a folder name.

    Handles edge cases:
    - Non-Latin characters (Japanese, Chinese, etc.)
    - Special characters
    - All-invalid names that normalize to empty

    The name is reduced through progressively more lossy strategies, and the
    first result accepted by ``is_valid_key`` is returned:

    1. Keep alphanumeric characters of any script, drop apostrophes, treat
       everything else as a word break.
    2. Keep only ASCII letters and digits from that result.
    3. Decompose accented characters (NFD), strip combining marks, and keep
       only ASCII letters and digits.
    4. Fall back to a key with a random 8-character hex suffix.

    NOTE: the fallback in step 4 is not deterministic (a new UUID is drawn on
    every call), and its prefix is built from alphanumeric characters of any
    script, so it may contain non-ASCII characters and is not guaranteed to
    satisfy ``is_valid_key``.

    Args:
        folder_name: The folder name to convert to a key

    Returns:
        A URL-safe key string. Never returns empty string.

    Raises:
        ValueError: If ``folder_name`` is empty or whitespace-only.

    Examples:
        >>> generate_key_from_folder("Attack on Titan (2013)")
        'attack-on-titan-2013'
        >>> generate_key_from_folder("A Time Called You (2023)")
        'a-time-called-you-2023'
        >>> generate_key_from_folder("25-sai no Joshikousei (2018)")
        '25-sai-no-joshikousei-2018'
        >>> generate_key_from_folder("Hell's Paradise (2023)")
        'hells-paradise-2023'
    """
    if not folder_name or not folder_name.strip():
        raise ValueError("Folder name cannot be empty")

    # Step 1: Unicode NFC normalization (preserves international chars)
    normalized = unicodedata.normalize('NFC', folder_name.strip())

    # Step 2: Keep alphanumeric characters from any script; everything else
    # becomes a word break, except apostrophes, which are dropped so that
    # they do not split a word.
    parts = []
    for char in normalized:
        # Checked before isalnum(): U+02BC is classified as a letter and
        # would otherwise be kept as part of the word.
        if char in _APOSTROPHES:
            continue
        parts.append(char if char.isalnum() else ' ')
    working = ''.join(parts)

    # Steps 3-5: Split into words, lowercase, hyphenate, and return if valid
    key = _hyphenate(working)
    if key and is_valid_key(key):
        return key

    # Step 6: Try just ASCII alphanumeric characters. Non-ASCII characters
    # are removed (not replaced by a space), so the surrounding ASCII
    # characters are joined into one word.
    key = _hyphenate(re.sub(r'[^a-zA-Z0-9\s]', '', working))
    if key and is_valid_key(key):
        return key

    # Step 7: Latinize what can be latinized. NFD splits accented characters
    # into base character + combining mark; dropping the marks (category Mn)
    # leaves the ASCII base letter where one exists.
    nfd_form = unicodedata.normalize('NFD', folder_name)
    latinized = ''.join(
        char for char in nfd_form
        if unicodedata.category(char) != 'Mn'
    )
    key = _hyphenate(re.sub(r'[^a-zA-Z0-9\s]', ' ', latinized))
    if key and is_valid_key(key):
        return key

    # Step 8: Absolute fallback - generate UUID-based key
    # Use first 8 chars of UUID for brevity
    uuid_key = uuid.uuid4().hex[:8]

    # Keep whatever alphanumeric characters the original name has, with every
    # run of other characters collapsed into a single hyphen.
    rough = ''.join(
        char.lower() if char.isalnum() else '-' for char in folder_name
    )
    fallback_base = re.sub(r'-+', '-', rough).strip('-')

    if len(fallback_base) >= 2:
        # Cap the readable prefix at 20 characters; the cut may land right
        # after a hyphen, so strip it to avoid "--" before the UUID suffix.
        fallback_base = fallback_base[:20].rstrip('-')
        return f"{fallback_base}-{uuid_key}"

    return f"series-{uuid_key}"
|
|
|
|
|
|
def validate_key_uniqueness(
    key: str,
    existing_keys: set[str],
) -> tuple[bool, str]:
    """Check that a key is well-formed and not already taken.

    Surrounding whitespace is stripped from ``key`` before any check, and the
    stripped value is what gets compared against ``existing_keys``.

    Args:
        key: The key to validate
        existing_keys: Set of keys that already exist

    Returns:
        Tuple of (is_valid, error_message). ``error_message`` is an empty
        string when the key is valid; otherwise it describes the first check
        that failed (empty, too short, not URL-safe, already in use).
    """
    candidate = key.strip() if key else ""

    if not candidate:
        problem = "Key cannot be empty"
    elif len(candidate) < 2:
        problem = "Key must be at least 2 characters"
    elif not is_valid_key(candidate):
        problem = "Key must be URL-safe (alphanumeric, hyphens, underscores only)"
    elif candidate in existing_keys:
        problem = f"Key '{candidate}' is already in use"
    else:
        problem = ""

    return (not problem), problem
|
|
|
|
|
|
def sanitize_key_for_url(key: str) -> str:
    """Strip characters from a key that could cause problems in a URL.

    Unlike a full normalization, this keeps the key's letter case and
    underscores. ``\\w`` is Unicode-aware for ``str`` patterns, so letters
    and digits from non-Latin scripts are kept as well.

    Args:
        key: The key to sanitize

    Returns:
        The key with spaces turned into hyphens, every character other than
        word characters and hyphens removed, runs of hyphens collapsed, and
        leading/trailing hyphens stripped. Empty input yields an empty
        string.
    """
    if not key:
        return ""

    # Only the literal space character becomes a hyphen; other whitespace
    # (tabs, newlines) is removed by the filter below.
    hyphenated = key.replace(' ', '-')
    filtered = re.sub(r'[^\w\-]', '', hyphenated)
    collapsed = re.sub(r'-+', '-', filtered)

    return collapsed.strip('-')
|
|
|
|
|
|
# Query parameter names (compared case-insensitively) whose values are
# replaced before a URL is written to a log.
_SENSITIVE_QUERY_PARAMS = frozenset({
    "token", "access_token", "refresh_token", "id_token",
    "key", "api_key", "apikey", "api-key",
    "secret", "client_secret",
    "password", "passwd", "pwd",
    "auth", "authorization",
    "signature", "sig",
    "session", "sessionid",
})

# Placeholder written in place of a redacted parameter value.
_REDACTED_VALUE = "***"


def sanitize_url_for_logging(url: str, max_length: int = 100) -> str:
    """Sanitize a URL for safe logging by removing sensitive query parameters.

    Values of query parameters whose name suggests a credential (token, key,
    password, signature, ...) are replaced with ``***``; the parameter names
    and the rest of the URL are kept so the log line stays useful for
    debugging. The result is then truncated to ``max_length`` characters.

    Only ``&``-separated ``name=value`` pairs in the query string are
    examined, and names are matched exactly (case-insensitively). The
    fragment and the rest of the URL are passed through unchanged.

    Args:
        url: The URL to sanitize
        max_length: Maximum length of the returned URL string, not counting
            the ``...`` suffix appended when the URL is truncated

    Returns:
        Sanitized URL safe for logging, or an empty string for empty input
    """
    if not url:
        return ""

    # Split by hand instead of parsing and re-encoding, so that everything
    # that is not redacted is reproduced exactly as it was passed in.
    before_fragment, hash_sep, fragment = url.partition('#')
    head, question_mark, query = before_fragment.partition('?')

    if question_mark:
        pairs = []
        for pair in query.split('&'):
            name, equals, _value = pair.partition('=')
            if equals and name.lower() in _SENSITIVE_QUERY_PARAMS:
                pair = f"{name}={_REDACTED_VALUE}"
            pairs.append(pair)
        before_fragment = f"{head}?{'&'.join(pairs)}"

    sanitized = f"{before_fragment}{hash_sep}{fragment}"

    # Truncate after redacting, so that a cut-off URL can never expose the
    # leading part of a secret value.
    if len(sanitized) > max_length:
        return sanitized[:max_length] + "..."

    return sanitized
|