From fae9bef8340dbaee2ba74f927374b6891915de5b Mon Sep 17 00:00:00 2001
From: projectdx
Date: Mon, 23 Mar 2026 21:37:03 +0900
Subject: [PATCH] fix(cache): harden ohli24 cached list parsing

- reject cached list/search html when rows lack parseable link and title
- skip malformed rows instead of failing whole list parsing
- record the cache parsing fix in the dated history log
- bump anime_downloader plugin version to 0.7.20

Co-Authored-By: First Fluke
---
 2026-03-23.history.md |  15 ++++++
 README.md             |   5 ++
 info.yaml             |   2 +-
 mod_ohli24.py         | 120 +++++++++++++++++++++++++++++++-----------
 4 files changed, 111 insertions(+), 31 deletions(-)
 create mode 100644 2026-03-23.history.md

diff --git a/2026-03-23.history.md b/2026-03-23.history.md
new file mode 100644
index 0000000..ecc4c7f
--- /dev/null
+++ b/2026-03-23.history.md
@@ -0,0 +1,15 @@
+# 2026-03-23 Work History
+
+## Summary
+- Strengthened Ohli24 browse-cache validation so list/search cache only survives when at least one row contains both a link and a title.
+- Hardened Ohli24 list, auto-list, and search parsing against nested `post-title` markup and missing image attributes.
+- Prevented malformed rows from crashing the entire list response by skipping incomplete entries with warning logs.
+
+## Implementation Notes
+- Added `_extract_text()` and `_extract_first()` helpers in [`mod_ohli24.py`](/Volumes/WD/Users/Work/python/ff_dev_plugins/anime_downloader/mod_ohli24.py) to avoid repeated unsafe XPath `[0]` access.
+- Extended `_is_valid_cached_html()` to parse list/search rows and reject cache payloads that only contain placeholder rows.
+- Updated changelog and plugin version to `0.7.20`.
+
+## Verification
+- `python3 -m py_compile /Volumes/WD/Users/Work/python/ff_dev_plugins/anime_downloader/mod_ohli24.py`
+- Smoke-checked validator behavior for valid list/detail HTML and blocked HTML patterns during implementation.
diff --git a/README.md b/README.md
index 90beacb..6f52dd0 100644
--- a/README.md
+++ b/README.md
@@ -84,6 +84,11 @@
 
 ## πŸ“ λ³€κ²½ 이λ ₯ (Changelog)
 
+### v0.7.20 (2026-03-23)
+- **Ohli24 μΊμ‹œ/λͺ©λ‘ νŒŒμ‹± 보강**:
+  - λͺ©λ‘/검색 μΊμ‹œ μ €μž₯ 전에 μ‹€μ œλ‘œ νŒŒμ‹± κ°€λŠ₯ν•œ `href + title` 행이 μžˆλŠ”μ§€ κ²€μ‚¬ν•˜μ—¬ skeleton/λΆˆμ™„μ „ HTML이 μΊμ‹œμ— 남지 μ•Šλ„λ‘ ν–ˆμŠ΅λ‹ˆλ‹€.
+  - λͺ©λ‘, μžλ™λͺ©λ‘, 검색 νŒŒμ„œλ₯Ό 쀑첩 DOM ꡬ쑰에 λŒ€μ‘ν•˜λ„λ‘ μˆ˜μ •ν•˜κ³ , 제λͺ©/링크가 μ—†λŠ” 행은 전체 μ‹€νŒ¨ λŒ€μ‹  κ±΄λ„ˆλ›°λ„λ‘ λ³€κ²½ν–ˆμŠ΅λ‹ˆλ‹€.
+
 ### v0.7.19 (2026-03-23)
 - **Ohli24 μΊμ‹œ 검증 1μ°¨ 적용**:
   - λΈŒλΌμš°μ§• μΊμ‹œ μ‚¬μš© μ „ HTML μœ νš¨μ„± 검사λ₯Ό μΆ”κ°€ν•˜μ—¬ 차단 νŽ˜μ΄μ§€λ‚˜ 비정상 응닡이 10λΆ„κ°„ μž¬μ‚¬μš©λ˜μ§€ μ•Šλ„λ‘ ν–ˆμŠ΅λ‹ˆλ‹€.
diff --git a/info.yaml b/info.yaml
index aadc637..a4ccc7f 100644
--- a/info.yaml
+++ b/info.yaml
@@ -1,5 +1,5 @@
 title: "μ• λ‹ˆ λ‹€μš΄λ‘œλ”"
-version: 0.7.19
+version: 0.7.20
 package_name: "anime_downloader"
 developer: "projectdx"
 description: "anime downloader"
diff --git a/mod_ohli24.py b/mod_ohli24.py
index 8e2979e..0ab2f0c 100644
--- a/mod_ohli24.py
+++ b/mod_ohli24.py
@@ -1824,21 +1824,25 @@ class LogicOhli24(AnimeModuleBase):
 
         for item in tmp_items:
             entity = {}
-            entity["link"] = item.xpath(".//a/@href")[0]
+            entity["link"] = LogicOhli24._extract_first(item, [".//a/@href"])
+            if not entity["link"]:
+                logger.warning("[Ohli24] Skipping list item without link")
+                continue
             entity["code"] = entity["link"].split("/")[-1]
-            entity["title"] = item.xpath(".//div[@class='post-title']/text()")[0].strip()
-            # logger.debug(item.xpath(".//div[@class='img-item']/img/@src")[0])
-            # logger.debug(item.xpath(".//div[@class='img-item']/img/@data-ezsrc")[0])
-            # entity["image_link"] = item.xpath(".//div[@class='img-item']/img/@src")[
-            #     0
-            # ].replace("..", P.ModelSetting.get("ohli24_url"))
+            entity["title"] = LogicOhli24._extract_text(item, ".//div[contains(@class, 'post-title')]//text()")
+            if not entity["title"]:
+                logger.warning(f"[Ohli24] Skipping list item without title: {entity['link']}")
+                continue
 
-            if len(item.xpath(".//div[@class='img-item']/img/@src")) > 0:
-                entity["image_link"] = item.xpath(".//div[@class='img-item']/img/@src")[0].replace(
-                    "..", LogicOhli24.get_base_url()
-                )
-            else:
-                entity["image_link"] = item.xpath(".//div[@class='img-item']/img/@data-ezsrc")[0]
+            image_link = LogicOhli24._extract_first(
+                item,
+                [
+                    ".//div[contains(@class, 'img-item')]//img/@src",
+                    ".//div[contains(@class, 'img-item')]//img/@data-src",
+                    ".//div[contains(@class, 'img-item')]//img/@data-ezsrc",
+                ],
+            )
+            entity["image_link"] = image_link.replace("..", LogicOhli24.get_base_url()) if image_link else ""
 
             data["ret"] = "success"
             data["anime_list"].append(entity)
@@ -1862,12 +1866,25 @@ class LogicOhli24(AnimeModuleBase):
 
         for item in tmp_items:
             entity = {}
-            entity["link"] = item.xpath(".//a/@href")[0]
+            entity["link"] = LogicOhli24._extract_first(item, [".//a/@href"])
+            if not entity["link"]:
+                logger.warning("[Ohli24] Skipping auto list item without link")
+                continue
             entity["code"] = entity["link"].split("/")[-1]
-            entity["title"] = item.xpath(".//div[@class='post-title']/text()")[0].strip()
-            entity["image_link"] = item.xpath(".//div[@class='img-item']/img/@src")[0].replace(
-                "..", LogicOhli24.get_base_url()
+            entity["title"] = LogicOhli24._extract_text(item, ".//div[contains(@class, 'post-title')]//text()")
+            if not entity["title"]:
+                logger.warning(f"[Ohli24] Skipping auto list item without title: {entity['link']}")
+                continue
+
+            image_link = LogicOhli24._extract_first(
+                item,
+                [
+                    ".//div[contains(@class, 'img-item')]//img/@src",
+                    ".//div[contains(@class, 'img-item')]//img/@data-src",
+                    ".//div[contains(@class, 'img-item')]//img/@data-ezsrc",
+                ],
             )
+            entity["image_link"] = image_link.replace("..", LogicOhli24.get_base_url()) if image_link else ""
 
             data["ret"] = "success"
             data["anime_list"].append(entity)
@@ -1901,21 +1918,26 @@ class LogicOhli24(AnimeModuleBase):
 
         # Clean up nested mess
         for item in tmp_items:
             entity = {}
-            entity["link"] = item.xpath(".//a/@href")[0]
-            # entity["code"] = entity["link"].split("/")[-1]
+            entity["link"] = LogicOhli24._extract_first(item, [".//a/@href"])
+            if not entity["link"]:
+                logger.warning("[Ohli24] Skipping search item without link")
+                continue
             entity["wr_id"] = entity["link"].split("=")[-1]
-            # logger.debug(item.xpath(".//div[@class='post-title']/text()").join())
-            entity["title"] = "".join(item.xpath(".//div[@class='post-title']/text()")).strip()
+            entity["title"] = LogicOhli24._extract_text(item, ".//div[contains(@class, 'post-title')]//text()")
+            if not entity["title"]:
+                logger.warning(f"[Ohli24] Skipping search item without title: {entity['link']}")
+                continue
 
             # Use multiple image attributes for lazy-loading support
-            img_attributes = [".//div[@class='img-item']/img/@src", ".//div[@class='img-item']/img/@data-src", ".//div[@class='img-item']/img/@data-ezsrc"]
-            original_img = ""
-            for attr in img_attributes:
-                matches = item.xpath(attr)
-                if matches and matches[0].strip():
-                    original_img = matches[0].replace("..", LogicOhli24.get_base_url())
-                    break
-
+            original_img = LogicOhli24._extract_first(
+                item,
+                [
+                    ".//div[contains(@class, 'img-item')]//img/@src",
+                    ".//div[contains(@class, 'img-item')]//img/@data-src",
+                    ".//div[contains(@class, 'img-item')]//img/@data-ezsrc",
+                ],
+            )
+            original_img = original_img.replace("..", LogicOhli24.get_base_url()) if original_img else ""
             if not original_img:
                 original_img = "https://via.placeholder.com/200x300?text=No+Image"
@@ -1926,7 +1948,13 @@
                 urllib.parse.quote(original_img)
             )
 
-            entity["code"] = item.xpath(".//div[@class='img-item']/img/@alt")[0]
+            entity["code"] = LogicOhli24._extract_first(
+                item,
+                [
+                    ".//div[contains(@class, 'img-item')]//img/@alt",
+                    ".//a/@data-wr_id",
+                ],
+            ) or entity["wr_id"]
 
             data["ret"] = "success"
             data["anime_list"].append(entity)
@@ -2405,8 +2433,40 @@
         if len(matched) < min_markers:
             return False, f"missing_markers:{page_type}"
 
+        if page_type in ("list", "search"):
+            try:
+                tree = html.fromstring(html_text)
+                rows = tree.xpath('//div[@class="list-row"]')
+                if rows:
+                    valid_rows = 0
+                    for row in rows:
+                        hrefs = row.xpath(".//a/@href")
+                        title_text = "".join(row.xpath(".//div[contains(@class, 'post-title')]//text()")).strip()
+                        if hrefs and title_text:
+                            valid_rows += 1
+                            break
+                    if valid_rows == 0:
+                        return False, f"invalid_rows:{page_type}"
+            except Exception:
+                return False, f"parse_error:{page_type}"
+
         return True, f"valid:{page_type}"
 
+    @staticmethod
+    def _extract_text(node, expression: str) -> str:
+        values = node.xpath(expression)
+        return "".join(value.strip() for value in values if isinstance(value, str) and value.strip()).strip()
+
+    @staticmethod
+    def _extract_first(node, expressions: List[str]) -> str:
+        for expression in expressions:
+            values = node.xpath(expression)
+            if values:
+                value = values[0].strip()
+                if value:
+                    return value
+        return ""
+
     @staticmethod
     def get_html_cached(url: str, **kwargs) -> str:
         """μΊμ‹œλœ λ²„μ „μ˜ get_html - λΈŒλΌμš°μ§• νŽ˜μ΄μ§€μš© (request, search λ“±