refactor: restructure core→server, split large entity files into database module
- Move src/core/ → src/server/
- Split SerieList.py (531 lines) and series.py (414 lines) into src/server/database/
- Add database/models.py for SQLAlchemy models
- Update all test imports to reflect new structure
- Remove deprecated test files (test_serie_class.py, test_serie_folder_with_year.py)
This commit is contained in:
248
src/server/utils/key_utils.py
Normal file
248
src/server/utils/key_utils.py
Normal file
@@ -0,0 +1,248 @@
|
||||
"""Utility functions for generating URL-safe keys from folder names.
|
||||
|
||||
This module provides key generation and normalization for anime series,
|
||||
handling edge cases like non-Latin characters and special symbols.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
# Valid key pattern: alphanumeric, hyphens, underscores
|
||||
# Must be at least 1 char, URL-safe
|
||||
# NOTE: on its own this pattern accepts single-character keys and allows any
# mix of ASCII letters, digits, '_' and '-' after the first character
# (including a trailing '-' or '_'). Because '$' also matches just before a
# trailing newline, prefer fullmatch() over match() for a strict whole-string
# check.
VALID_KEY_PATTERN = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_-]*$')
|
||||
|
||||
|
||||
def normalize_key(key: str) -> str:
    """Normalize a key to a lowercase, hyphen-separated ASCII slug.

    Args:
        key: The key to normalize.

    Returns:
        The lowercased key with runs of whitespace/underscores turned into
        a hyphen, every character outside ``a-z``, ``0-9`` and ``-``
        removed, consecutive hyphens collapsed, and leading/trailing
        hyphens stripped. Returns ``""`` for empty input, and also when no
        character survives the filter (e.g. a key made only of non-ASCII
        characters).
    """
    if not key:
        return ""

    # Lowercase first so the ASCII filter below only has to allow a-z.
    normalized = key.lower()

    # Whitespace and underscores both act as word separators.
    normalized = re.sub(r'[\s_]+', '-', normalized)

    # Drop everything that is not an ASCII letter, digit, or hyphen.
    normalized = re.sub(r'[^a-z0-9-]', '', normalized)

    # Removing characters can leave adjacent hyphens ("a & b" -> "a--b").
    normalized = re.sub(r'-+', '-', normalized)

    return normalized.strip('-')
|
||||
|
||||
|
||||
def is_valid_key(key: str) -> bool:
    """Check if a key is valid for URL-safe use.

    A key is valid when it is at least 2 characters long, starts with an
    ASCII letter or digit, and otherwise contains only ASCII letters,
    digits, underscores and hyphens. A trailing hyphen or underscore is
    accepted by the pattern.

    Args:
        key: The key to validate.

    Returns:
        True if the whole key satisfies the rules above, False otherwise
        (including for empty or whitespace-only input).
    """
    if not key or not key.strip():
        return False

    if len(key) < 2:
        return False

    # fullmatch() instead of match(): with match(), the pattern's '$' also
    # matches just before a trailing newline, so "ab\n" would be accepted.
    return VALID_KEY_PATTERN.fullmatch(key) is not None
|
||||
|
||||
|
||||
def generate_key_from_folder(folder_name: str) -> str:
    """Generate a key from a folder name.

    Candidates are tried in order and the first one accepted by
    ``is_valid_key`` is returned:

    1. Unicode alphanumerics of the NFC-normalized name, lowercased and
       hyphen-joined (apostrophe-like characters are dropped, every other
       symbol separates words).
    2. The same text with everything except ASCII letters/digits removed.
    3. The NFD-decomposed name with combining marks stripped, ASCII only.
    4. Fallback: alphanumerics of the name (any script, at most 20
       characters) plus a random 8-hex-digit suffix, or ``series-<hex>``
       when fewer than 2 usable characters remain.

    Args:
        folder_name: The folder name to convert to a key.

    Returns:
        A non-empty key string. Results of steps 1-3 satisfy
        ``is_valid_key``; the fallback may contain non-ASCII characters and
        is different on every call because it embeds ``uuid4`` output.

    Raises:
        ValueError: If ``folder_name`` is empty or whitespace-only.

    Examples:
        >>> generate_key_from_folder("Attack on Titan (2013)")
        'attack-on-titan-2013'
        >>> generate_key_from_folder("A Time Called You (2023)")
        'a-time-called-you-2023'
        >>> generate_key_from_folder("25-sai no Joshikousei (2018)")
        '25-sai-no-joshikousei-2018'
    """
    if not folder_name or not folder_name.strip():
        raise ValueError("Folder name cannot be empty")

    # Characters removed outright so "Hell's" becomes "Hells" instead of
    # "Hell s". Written as escapes so the set survives editors/tools that
    # flatten typographic quotes to ASCII.
    # NOTE(review): the two double-quote entries are reconstructed from a
    # garbled source line - confirm against the repository.
    apostrophes = frozenset((
        "'",       # U+0027 apostrophe
        "\u2018",  # left single quotation mark
        "\u2019",  # right single quotation mark
        "\u02bc",  # modifier letter apostrophe
        "`",       # U+0060 grave accent
        "\u201c",  # left double quotation mark
        "\u201d",  # right double quotation mark
    ))

    def _slug(text: str) -> str:
        """Lowercase whitespace-separated words and join them with hyphens."""
        return '-'.join(word.lower() for word in text.split())

    # Step 1: Unicode NFC normalization (preserves international chars).
    normalized = unicodedata.normalize('NFC', folder_name.strip())

    # Step 2: keep alphanumerics from any script, drop apostrophes, and turn
    # everything else (whitespace, punctuation, symbols) into a separator.
    # The apostrophe test comes first because U+02BC is itself alphanumeric
    # according to str.isalnum() and would otherwise never be dropped.
    parts = []
    for char in normalized:
        if char in apostrophes:
            continue
        parts.append(char if char.isalnum() else ' ')
    working = ''.join(parts)

    # Steps 3-5: hyphenated lowercase key; accepted only if pure ASCII.
    key = _slug(working)
    if is_valid_key(key):
        return key

    # Step 6: remove every non-ASCII character and retry.
    # NOTE(review): because this runs before the transliteration step,
    # accented letters are dropped rather than latinized
    # ("Pokémon" -> "pokmon"). Left unchanged to keep generated keys stable.
    key = _slug(re.sub(r'[^a-zA-Z0-9\s]', '', working))
    if is_valid_key(key):
        return key

    # Step 7: decompose (NFD) and strip combining marks so e.g. "Ça"
    # yields "ca", then keep ASCII letters/digits only.
    nfd_form = unicodedata.normalize('NFD', folder_name)
    latinized = ''.join(
        char for char in nfd_form
        if unicodedata.category(char) != 'Mn'  # Strip combining marks
    )
    key = _slug(re.sub(r'[^a-zA-Z0-9\s]', ' ', latinized))
    if is_valid_key(key):
        return key

    # Step 8: absolute fallback - random suffix (first 8 hex chars of a
    # UUID4 for brevity), so repeated calls return different keys.
    uuid_key = uuid.uuid4().hex[:8]

    # Keep whatever alphanumerics the name has (any script) as a readable
    # prefix; non-alphanumerics become hyphens once a prefix has started.
    meaningful_parts = []
    for char in folder_name:
        if char.isalnum():
            meaningful_parts.append(char.lower())
        elif meaningful_parts:
            meaningful_parts.append('-')

    fallback_base = ''.join(meaningful_parts).strip('-')
    if fallback_base and len(fallback_base) >= 2:
        # Truncate long prefixes; the UUID suffix provides uniqueness.
        if len(fallback_base) > 20:
            fallback_base = fallback_base[:20]
        return f"{fallback_base}-{uuid_key}"

    return f"series-{uuid_key}"
|
||||
|
||||
|
||||
def validate_key_uniqueness(
    key: str,
    existing_keys: set[str],
) -> tuple[bool, str]:
    """Validate that a key is well-formed and not already in use.

    Surrounding whitespace is stripped from ``key`` before the length,
    format and membership checks.

    Args:
        key: The key to validate.
        existing_keys: Set of keys that already exist. Membership is
            checked by exact (case-sensitive) equality.

    Returns:
        Tuple of ``(is_valid, error_message)``. ``error_message`` is ``""``
        when the key is valid; otherwise it describes the first failed
        check.
    """
    if not key or not key.strip():
        return False, "Key cannot be empty"

    stripped = key.strip()

    # Checked separately from is_valid_key() so that short keys get a
    # specific message instead of the generic "URL-safe" one.
    if len(stripped) < 2:
        return False, "Key must be at least 2 characters"

    if not is_valid_key(stripped):
        return False, "Key must be URL-safe (alphanumeric, hyphens, underscores only)"

    if stripped in existing_keys:
        return False, f"Key '{stripped}' is already in use"

    return True, ""
|
||||
|
||||
|
||||
def sanitize_key_for_url(key: str) -> str:
    """Sanitize a key for safe URL usage.

    Args:
        key: The key to sanitize.

    Returns:
        The key with spaces replaced by hyphens, every character other than
        word characters and hyphens removed, consecutive hyphens collapsed,
        and leading/trailing hyphens stripped. Returns ``""`` for empty
        input. Case is preserved, and because ``\\w`` is Unicode-aware for
        ``str`` patterns, non-ASCII letters and digits are kept.
    """
    if not key:
        return ""

    # Only literal spaces become hyphens; other whitespace (tabs, newlines)
    # is removed by the filter below.
    sanitized = key.replace(' ', '-')

    # Keep word characters (letters, digits, underscore) and hyphens.
    sanitized = re.sub(r'[^\w\-]', '', sanitized)

    # Collapse multiple hyphens.
    sanitized = re.sub(r'-+', '-', sanitized)

    return sanitized.strip('-')
|
||||
|
||||
|
||||
def sanitize_url_for_logging(url: str, max_length: int = 100) -> str:
    """Sanitize a URL for safe logging by masking sensitive query parameters.

    Values of query parameters whose names look sensitive (tokens, API
    keys, secrets, passwords, signatures, ...) are replaced with ``***``.
    Everything else - scheme, host, path, other parameters, fragment - is
    left byte-for-byte intact so the log line stays useful for debugging.
    The result is then truncated.

    Args:
        url: The URL to sanitize.
        max_length: Number of characters of the (masked) URL to keep. When
            the URL is longer, it is cut at this length and ``"..."`` is
            appended, so the returned string can be up to
            ``max_length + 3`` characters long.

    Returns:
        Sanitized URL safe for logging, or ``""`` for empty input.
    """
    if not url:
        return ""

    # Parameter names containing any of these are treated as sensitive.
    # A bare "key" is intentionally not listed: within this module a key is
    # a series identifier, not a secret.
    sensitive_substrings = (
        "token", "secret", "password", "passwd", "signature", "credential",
        "apikey", "api_key", "access_key", "private_key",
    )
    # Short names that would over-match as substrings (e.g. "auth" inside
    # "author") are only matched exactly.
    sensitive_exact = {"auth", "authorization", "pwd", "sig", "session", "sessionid"}

    def _mask(pair: str) -> str:
        """Return ``name=***`` if the pair's name looks sensitive."""
        name, sep, _value = pair.partition('=')
        lowered = name.lower()
        if sep and (
            lowered in sensitive_exact
            or any(marker in lowered for marker in sensitive_substrings)
        ):
            return f"{name}=***"
        return pair

    # Split off the fragment first so a '?' inside it is not mistaken for
    # the start of the query string.
    main, frag_sep, fragment = url.partition('#')
    base, query_sep, query = main.partition('?')
    if query_sep:
        masked_query = '&'.join(_mask(pair) for pair in query.split('&'))
        url = f"{base}?{masked_query}{frag_sep}{fragment}"

    # Truncate after masking so a secret can never survive in the kept prefix.
    if len(url) > max_length:
        return url[:max_length] + "..."

    return url
|
||||
Reference in New Issue
Block a user