fix(cache): harden ohli24 cached list parsing
- reject cached list/search html when rows lack parseable link and title
- skip malformed rows instead of failing whole list parsing
- record the cache parsing fix in the dated history log
- bump anime_downloader plugin version to 0.7.20

Co-Authored-By: First Fluke <our.first.fluke@gmail.com>
This commit is contained in:
15
2026-03-23.history.md
Normal file
15
2026-03-23.history.md
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# 2026-03-23 Work History
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
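- Strengthened Ohli24 browse-cache validation so list/search cache that contains `list-row` rows only survives when at least one of those rows has both a link and a non-empty title; payloads with no such rows are still judged by the marker check alone.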
|
||||||
|
- Hardened Ohli24 list, auto-list, and search parsing against nested `post-title` markup and missing image attributes.
|
||||||
|
- Prevented malformed rows from crashing the entire list response by skipping incomplete entries with warning logs.
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
- Added `_extract_text()` and `_extract_first()` helpers in `mod_ohli24.py` to avoid repeated unsafe XPath `[0]` access.
|
||||||
|
- Extended `_is_valid_cached_html()` to parse list/search rows and reject cache payloads that only contain placeholder rows.
|
||||||
|
- Updated changelog and plugin version to `0.7.20`.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
- `python3 -m py_compile /Volumes/WD/Users/Work/python/ff_dev_plugins/anime_downloader/mod_ohli24.py`
|
||||||
|
- Smoke-checked validator behavior for valid list/detail HTML and blocked HTML patterns during implementation.
|
||||||
@@ -84,6 +84,11 @@
|
|||||||
|
|
||||||
## 📝 변경 이력 (Changelog)
|
## 📝 변경 이력 (Changelog)
|
||||||
|
|
||||||
|
### v0.7.20 (2026-03-23)
|
||||||
|
- **Ohli24 캐시/목록 파싱 보강**:
|
||||||
|
- 목록/검색 캐시 저장 전에 실제로 파싱 가능한 `href + title` 행이 있는지 검사하여 skeleton/불완전 HTML이 캐시에 남지 않도록 했습니다.
|
||||||
|
- 목록, 자동목록, 검색 파서를 중첩 DOM 구조에 대응하도록 수정하고, 제목/링크가 없는 행은 전체 실패 대신 건너뛰도록 변경했습니다.
|
||||||
|
|
||||||
### v0.7.19 (2026-03-23)
|
### v0.7.19 (2026-03-23)
|
||||||
- **Ohli24 캐시 검증 1차 적용**:
|
- **Ohli24 캐시 검증 1차 적용**:
|
||||||
- 브라우징 캐시 사용 전 HTML 유효성 검사를 추가하여 차단 페이지나 비정상 응답이 10분간 재사용되지 않도록 했습니다.
|
- 브라우징 캐시 사용 전 HTML 유효성 검사를 추가하여 차단 페이지나 비정상 응답이 10분간 재사용되지 않도록 했습니다.
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
title: "애니 다운로더"
|
title: "애니 다운로더"
|
||||||
version: 0.7.19
|
version: 0.7.20
|
||||||
package_name: "anime_downloader"
|
package_name: "anime_downloader"
|
||||||
developer: "projectdx"
|
developer: "projectdx"
|
||||||
description: "anime downloader"
|
description: "anime downloader"
|
||||||
|
|||||||
120
mod_ohli24.py
120
mod_ohli24.py
@@ -1824,21 +1824,25 @@ class LogicOhli24(AnimeModuleBase):
|
|||||||
|
|
||||||
for item in tmp_items:
|
for item in tmp_items:
|
||||||
entity = {}
|
entity = {}
|
||||||
entity["link"] = item.xpath(".//a/@href")[0]
|
entity["link"] = LogicOhli24._extract_first(item, [".//a/@href"])
|
||||||
|
if not entity["link"]:
|
||||||
|
logger.warning("[Ohli24] Skipping list item without link")
|
||||||
|
continue
|
||||||
entity["code"] = entity["link"].split("/")[-1]
|
entity["code"] = entity["link"].split("/")[-1]
|
||||||
entity["title"] = item.xpath(".//div[@class='post-title']/text()")[0].strip()
|
entity["title"] = LogicOhli24._extract_text(item, ".//div[contains(@class, 'post-title')]//text()")
|
||||||
# logger.debug(item.xpath(".//div[@class='img-item']/img/@src")[0])
|
if not entity["title"]:
|
||||||
# logger.debug(item.xpath(".//div[@class='img-item']/img/@data-ezsrc")[0])
|
logger.warning(f"[Ohli24] Skipping list item without title: {entity['link']}")
|
||||||
# entity["image_link"] = item.xpath(".//div[@class='img-item']/img/@src")[
|
continue
|
||||||
# 0
|
|
||||||
# ].replace("..", P.ModelSetting.get("ohli24_url"))
|
|
||||||
|
|
||||||
if len(item.xpath(".//div[@class='img-item']/img/@src")) > 0:
|
image_link = LogicOhli24._extract_first(
|
||||||
entity["image_link"] = item.xpath(".//div[@class='img-item']/img/@src")[0].replace(
|
item,
|
||||||
"..", LogicOhli24.get_base_url()
|
[
|
||||||
)
|
".//div[contains(@class, 'img-item')]//img/@src",
|
||||||
else:
|
".//div[contains(@class, 'img-item')]//img/@data-src",
|
||||||
entity["image_link"] = item.xpath(".//div[@class='img-item']/img/@data-ezsrc")[0]
|
".//div[contains(@class, 'img-item')]//img/@data-ezsrc",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
entity["image_link"] = image_link.replace("..", LogicOhli24.get_base_url()) if image_link else ""
|
||||||
|
|
||||||
data["ret"] = "success"
|
data["ret"] = "success"
|
||||||
data["anime_list"].append(entity)
|
data["anime_list"].append(entity)
|
||||||
@@ -1862,12 +1866,25 @@ class LogicOhli24(AnimeModuleBase):
|
|||||||
|
|
||||||
for item in tmp_items:
|
for item in tmp_items:
|
||||||
entity = {}
|
entity = {}
|
||||||
entity["link"] = item.xpath(".//a/@href")[0]
|
entity["link"] = LogicOhli24._extract_first(item, [".//a/@href"])
|
||||||
|
if not entity["link"]:
|
||||||
|
logger.warning("[Ohli24] Skipping auto list item without link")
|
||||||
|
continue
|
||||||
entity["code"] = entity["link"].split("/")[-1]
|
entity["code"] = entity["link"].split("/")[-1]
|
||||||
entity["title"] = item.xpath(".//div[@class='post-title']/text()")[0].strip()
|
entity["title"] = LogicOhli24._extract_text(item, ".//div[contains(@class, 'post-title')]//text()")
|
||||||
entity["image_link"] = item.xpath(".//div[@class='img-item']/img/@src")[0].replace(
|
if not entity["title"]:
|
||||||
"..", LogicOhli24.get_base_url()
|
logger.warning(f"[Ohli24] Skipping auto list item without title: {entity['link']}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
image_link = LogicOhli24._extract_first(
|
||||||
|
item,
|
||||||
|
[
|
||||||
|
".//div[contains(@class, 'img-item')]//img/@src",
|
||||||
|
".//div[contains(@class, 'img-item')]//img/@data-src",
|
||||||
|
".//div[contains(@class, 'img-item')]//img/@data-ezsrc",
|
||||||
|
],
|
||||||
)
|
)
|
||||||
|
entity["image_link"] = image_link.replace("..", LogicOhli24.get_base_url()) if image_link else ""
|
||||||
data["ret"] = "success"
|
data["ret"] = "success"
|
||||||
data["anime_list"].append(entity)
|
data["anime_list"].append(entity)
|
||||||
|
|
||||||
@@ -1901,21 +1918,26 @@ class LogicOhli24(AnimeModuleBase):
|
|||||||
# Clean up nested mess
|
# Clean up nested mess
|
||||||
for item in tmp_items:
|
for item in tmp_items:
|
||||||
entity = {}
|
entity = {}
|
||||||
entity["link"] = item.xpath(".//a/@href")[0]
|
entity["link"] = LogicOhli24._extract_first(item, [".//a/@href"])
|
||||||
# entity["code"] = entity["link"].split("/")[-1]
|
if not entity["link"]:
|
||||||
|
logger.warning("[Ohli24] Skipping search item without link")
|
||||||
|
continue
|
||||||
entity["wr_id"] = entity["link"].split("=")[-1]
|
entity["wr_id"] = entity["link"].split("=")[-1]
|
||||||
# logger.debug(item.xpath(".//div[@class='post-title']/text()").join())
|
entity["title"] = LogicOhli24._extract_text(item, ".//div[contains(@class, 'post-title')]//text()")
|
||||||
entity["title"] = "".join(item.xpath(".//div[@class='post-title']/text()")).strip()
|
if not entity["title"]:
|
||||||
|
logger.warning(f"[Ohli24] Skipping search item without title: {entity['link']}")
|
||||||
|
continue
|
||||||
|
|
||||||
# Use multiple image attributes for lazy-loading support
|
# Use multiple image attributes for lazy-loading support
|
||||||
img_attributes = [".//div[@class='img-item']/img/@src", ".//div[@class='img-item']/img/@data-src", ".//div[@class='img-item']/img/@data-ezsrc"]
|
original_img = LogicOhli24._extract_first(
|
||||||
original_img = ""
|
item,
|
||||||
for attr in img_attributes:
|
[
|
||||||
matches = item.xpath(attr)
|
".//div[contains(@class, 'img-item')]//img/@src",
|
||||||
if matches and matches[0].strip():
|
".//div[contains(@class, 'img-item')]//img/@data-src",
|
||||||
original_img = matches[0].replace("..", LogicOhli24.get_base_url())
|
".//div[contains(@class, 'img-item')]//img/@data-ezsrc",
|
||||||
break
|
],
|
||||||
|
)
|
||||||
|
original_img = original_img.replace("..", LogicOhli24.get_base_url()) if original_img else ""
|
||||||
if not original_img:
|
if not original_img:
|
||||||
original_img = "https://via.placeholder.com/200x300?text=No+Image"
|
original_img = "https://via.placeholder.com/200x300?text=No+Image"
|
||||||
|
|
||||||
@@ -1926,7 +1948,13 @@ class LogicOhli24(AnimeModuleBase):
|
|||||||
urllib.parse.quote(original_img)
|
urllib.parse.quote(original_img)
|
||||||
)
|
)
|
||||||
|
|
||||||
entity["code"] = item.xpath(".//div[@class='img-item']/img/@alt")[0]
|
entity["code"] = LogicOhli24._extract_first(
|
||||||
|
item,
|
||||||
|
[
|
||||||
|
".//div[contains(@class, 'img-item')]//img/@alt",
|
||||||
|
".//a/@data-wr_id",
|
||||||
|
],
|
||||||
|
) or entity["wr_id"]
|
||||||
|
|
||||||
data["ret"] = "success"
|
data["ret"] = "success"
|
||||||
data["anime_list"].append(entity)
|
data["anime_list"].append(entity)
|
||||||
@@ -2405,8 +2433,40 @@ class LogicOhli24(AnimeModuleBase):
|
|||||||
if len(matched) < min_markers:
|
if len(matched) < min_markers:
|
||||||
return False, f"missing_markers:{page_type}"
|
return False, f"missing_markers:{page_type}"
|
||||||
|
|
||||||
|
if page_type in ("list", "search"):
|
||||||
|
try:
|
||||||
|
tree = html.fromstring(html_text)
|
||||||
|
rows = tree.xpath('//div[@class="list-row"]')
|
||||||
|
if rows:
|
||||||
|
valid_rows = 0
|
||||||
|
for row in rows:
|
||||||
|
hrefs = row.xpath(".//a/@href")
|
||||||
|
title_text = "".join(row.xpath(".//div[contains(@class, 'post-title')]//text()")).strip()
|
||||||
|
if hrefs and title_text:
|
||||||
|
valid_rows += 1
|
||||||
|
break
|
||||||
|
if valid_rows == 0:
|
||||||
|
return False, f"invalid_rows:{page_type}"
|
||||||
|
except Exception:
|
||||||
|
return False, f"parse_error:{page_type}"
|
||||||
|
|
||||||
return True, f"valid:{page_type}"
|
return True, f"valid:{page_type}"
|
||||||
|
|
||||||
|
@staticmethod
def _extract_text(node, expression: str) -> str:
    """Concatenate the non-blank string results of an XPath expression.

    Each string result is stripped individually, blank results are dropped,
    and the remaining fragments are joined with no separator. Non-string
    results are ignored.

    Args:
        node: Object exposing an ``xpath(expression)`` method.
        expression: XPath expression to evaluate against ``node``.

    Returns:
        The joined text, or ``""`` when nothing usable was found.
    """
    fragments = []
    for raw in node.xpath(expression):
        if not isinstance(raw, str):
            continue
        trimmed = raw.strip()
        if trimmed:
            fragments.append(trimmed)
    return "".join(fragments).strip()
|
||||||
|
|
||||||
|
@staticmethod
def _extract_first(node, expressions: List[str]) -> str:
    """Return the first non-blank string found by a prioritized list of XPaths.

    Expressions are tried in order. For each expression only its first
    match is inspected; when that match is missing, blank, or not a string,
    the next expression is tried.

    Args:
        node: Object exposing an ``xpath(expression)`` method.
        expressions: XPath expressions in priority order.

    Returns:
        The stripped first match of the first expression that yields a
        non-blank string, or ``""`` when no expression does.
    """
    for expression in expressions:
        matches = node.xpath(expression)
        if not matches:
            continue
        head = matches[0]
        # A non-string match has no ``strip``; skip it (the same isinstance
        # filter _extract_text applies) rather than raising on a malformed row.
        if not isinstance(head, str):
            continue
        head = head.strip()
        if head:
            return head
    return ""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_html_cached(url: str, **kwargs) -> str:
|
def get_html_cached(url: str, **kwargs) -> str:
|
||||||
"""캐시된 버전의 get_html - 브라우징 페이지용 (request, search 등)
|
"""캐시된 버전의 get_html - 브라우징 페이지용 (request, search 등)
|
||||||
|
|||||||
Reference in New Issue
Block a user