fix: remove HTML size filter and fix nested div extraction

- article_fetcher: remove 500KB length check that falsely rejected valid articles (most articles are 50-300KB)

- content_processor: replace non-greedy regex with depth-counting approach to correctly extract content from nested divs
This commit is contained in:
tmwgsicp 2026-03-18 23:15:09 +08:00
parent ee704f87b1
commit 869c5c0c92
2 changed files with 49 additions and 32 deletions

View File

@ -70,17 +70,16 @@ async def _fetch_via_proxy(
timeout=timeout
)
# 验证内容有效性
if "js_content" in html and len(html) > 500000:
logger.info("[Proxy] len=%d", len(html))
# 验证内容有效性: 只检查 js_content 是否存在
if "js_content" in html:
logger.info("[Proxy] len=%d", len(html))
return html
else:
logger.warning("[Proxy] ❌ 内容无效 (len=%d, has_js_content=%s)",
len(html), "js_content" in html)
logger.warning("[Proxy] invalid content (len=%d, has_js_content=False)", len(html))
return None
except Exception as e:
logger.error("[Proxy] %s", str(e)[:100])
logger.error("[Proxy] %s", str(e)[:100])
return None

View File

@ -61,35 +61,53 @@ def process_article_content(html: str, proxy_base_url: str = None) -> Dict:
}
def _extract_div_inner(html: str, open_tag_pattern: str) -> str:
"""
Extract the inner HTML of a <div> matched by open_tag_pattern,
correctly handling nested <div> tags by counting open/close depth.
"""
m = re.search(open_tag_pattern, html, re.DOTALL | re.IGNORECASE)
if not m:
return ""
start = m.end()
depth = 1
pos = start
open_re = re.compile(r'<div[\s>/]', re.IGNORECASE)
close_re = re.compile(r'</div\s*>', re.IGNORECASE)
while depth > 0 and pos < len(html):
next_open = open_re.search(html, pos)
next_close = close_re.search(html, pos)
if next_close is None:
break
if next_open and next_open.start() < next_close.start():
depth += 1
pos = next_open.end()
else:
depth -= 1
if depth == 0:
return html[start:next_close.start()].strip()
pos = next_close.end()
return html[start:].strip()
def extract_content(html: str) -> str:
    """
    Extract the article body from WeChat article HTML, preserving markup.

    The body lives inside <div id="js_content">, whose inner HTML already
    has text and images in reading order. Nested <div> tags are handled by
    _extract_div_inner's depth counting (a non-greedy ``(.*?)</div>`` regex
    would stop at the first close tag and truncate nested content).

    Args:
        html: Full page HTML of a WeChat article.

    Returns:
        The inner HTML of the body container, or "" when neither the
        id-based nor the class-based container can be found.
    """
    # Primary: the id="js_content" container used by WeChat articles.
    content = _extract_div_inner(
        html, r'<div[^>]*\bid=["\']js_content["\'][^>]*>'
    )
    if content:
        return content
    # Fallback: some pages only expose the rich_media_content class.
    content = _extract_div_inner(
        html,
        r'<div[^>]*\bclass=["\'][^"\']*rich_media_content[^"\']*["\'][^>]*>'
    )
    if content:
        return content
    logger.warning("Failed to extract article body")
    return ""