fix(cache): validate cached ohli24 html before reuse

- add basic HTML validation before reading or writing the browse cache
- purge invalid cached responses instead of reusing them until the cache TTL expires
- bump anime_downloader plugin version to 0.7.19

Co-Authored-By: First Fluke <our.first.fluke@gmail.com>
2026-03-23 21:26:54 +09:00
parent 643fbd47b9
commit 1c6ec0c52e
3 changed files with 82 additions and 9 deletions
--- a/mod_ohli24.py
+++ b/mod_ohli24.py
@@ -16,6 +16,7 @@ import re
 import subprocess
 import sys
 import threading
+import time
 import traceback
 import urllib
 import unicodedata
@@ -2345,6 +2346,66 @@ class LogicOhli24(AnimeModuleBase):
        
        return response_data

+    @staticmethod
+    def _get_cache_page_type(url: str) -> str:  # classify url as "search", "list", "detail" or "generic"
+        parsed = parse.urlparse(url)  # only the path and query components are inspected below
+
+        if "/bbs/search.php" in parsed.path:
+            return "search"
+        if "/bbs/board.php" in parsed.path:
+            if "wr_id=" in parsed.query:  # board.php with a wr_id= query is classified as "detail"
+                return "detail"
+            return "list"  # board.php without wr_id=
+        if "/e/" in parsed.path or "/c/" in parsed.path:  # substring match anywhere in the path
+            return "detail"
+        return "generic"  # fallback when none of the path patterns matched
+
+    @classmethod
+    def _is_valid_cached_html(cls, url: str, html_text: str) -> Tuple[bool, str]:  # returns (is_valid, reason tag); NOTE(review): Tuple import is not visible in this diff - confirm
+        if not html_text:  # covers both None and the empty string
+            return False, "empty"
+
+        html_text = html_text.strip()  # surrounding whitespace does not count toward the length check
+        if len(html_text) < 200:
+            return False, "too_short"
+
+        lowered = html_text.lower()  # every marker test below is a case-insensitive substring test
+        blocked_markers = [  # NOTE(review): broad substrings ("cloudflare", "blocked", "captcha") may also appear in legitimate pages - confirm false positives are acceptable
+            "just a moment",
+            "access denied",
+            "captcha",
+            "attention required",
+            "enable javascript",
+            "cf-browser-verification",
+            "cloudflare",
+            "blocked",
+            "403 forbidden",
+            "error 403",
+            "error 404",
+            "error 500",
+            "too many requests",
+        ]
+        for marker in blocked_markers:
+            if marker in lowered:
+                return False, f"blocked:{marker}"  # reason names the first marker found, in list order
+
+        if "<html" not in lowered or "</html>" not in lowered:  # both the opening and the closing html tag must be present
+            return False, "not_html_document"
+
+        page_type = cls._get_cache_page_type(url)
+        markers_by_type = {  # substrings looked for in the lowercased HTML, keyed by page type
+            "list": ["list-row", "post-title", "img-item", "board-list", "list-wrap"],
+            "search": ["list-row", "post-title", "img-item", "search.php", "board-list"],
+            "detail": ["item-subject", 'itemprop="headline"', 'itemprop="image"', "view-wrap", "serial-movie-wrap"],
+            "generic": ["<body", "og:title", "viewport"],
+        }
+
+        matched = [marker for marker in markers_by_type.get(page_type, []) if marker in lowered]  # an unknown page type yields no markers
+        min_markers = 1 if page_type in ("list", "search", "generic") else 2  # "detail" (and any other type) needs two distinct markers
+        if len(matched) < min_markers:
+            return False, f"missing_markers:{page_type}"
+
+        return True, f"valid:{page_type}"

    @staticmethod
    def get_html_cached(url: str, **kwargs) -> str:
@@ -2353,8 +2414,6 @@ class LogicOhli24(AnimeModuleBase):
        캐시 시간은 ohli24_cache_minutes 설정에 따름 (0=캐시 없음)
        다운로드 루틴은 이 함수를 사용하지 않음 (세션/헤더 필요)
        """
-        import hashlib
-        
        cache_minutes = int(P.ModelSetting.get("ohli24_cache_minutes") or 0)
        
        # 캐시 비활성화 시 바로 fetch
@@ -2377,11 +2436,17 @@ class LogicOhli24(AnimeModuleBase):
                try:
                    with open(cache_file, 'r', encoding='utf-8') as f:
                        cached_html = f.read()
-                    if cached_html and len(cached_html) > 100:
-                        logger.debug(f"[Cache HIT] {url[:60]}... (age: {cache_age:.0f}s)")
+                    is_valid, reason = LogicOhli24._is_valid_cached_html(url, cached_html)
+                    if is_valid:
+                        logger.debug(f"[Cache HIT] {url[:60]}... (age: {cache_age:.0f}s, reason: {reason})")
                        return cached_html
-                    else:
-                        logger.debug(f"[Cache MISS] Cached content is empty or too short for {url[:60]}...")
+
+                    logger.warning(f"[Cache INVALID] {url[:60]}... (reason: {reason})")
+                    try:
+                        os.remove(cache_file)
+                        logger.debug(f"[Cache PURGE] Removed invalid cache for {url[:60]}...")
+                    except OSError as purge_error:
+                        logger.warning(f"[Cache PURGE ERROR] {purge_error}")
                except Exception as e:
                    logger.warning(f"[Cache READ ERROR] {e}")
            else:
@@ -2393,13 +2458,16 @@ class LogicOhli24(AnimeModuleBase):
        html = LogicOhli24.get_html(url, **kwargs)
        
        # 캐시에 저장 (유효한 HTML만)
-        if html and len(html) > 100:
+        is_valid, reason = LogicOhli24._is_valid_cached_html(url, html)
+        if is_valid:
            try:
                with open(cache_file, 'w', encoding='utf-8') as f:
                    f.write(html)
-                logger.debug(f"[Cache SAVE] {url[:60]}...")
+                logger.debug(f"[Cache SAVE] {url[:60]}... (reason: {reason})")
            except Exception as e:
                logger.warning(f"[Cache WRITE ERROR] {e}")
+        elif html:
+            logger.warning(f"[Cache SKIP SAVE] {url[:60]}... (reason: {reason})")
        
        return html