fix: remove HTML size filter and fix nested div extraction
- article_fetcher: remove the 500KB length check that falsely rejected valid articles (most articles are 50-300KB)
- content_processor: replace the non-greedy regex with a depth-counting approach to correctly extract content from nested divs
This commit is contained in:
parent
ee704f87b1
commit
869c5c0c92
|
|
@ -70,17 +70,16 @@ async def _fetch_via_proxy(
|
|||
timeout=timeout
|
||||
)
|
||||
|
||||
# 验证内容有效性
|
||||
if "js_content" in html and len(html) > 500000:
|
||||
logger.info("[Proxy] ✅ len=%d", len(html))
|
||||
# 验证内容有效性: 只检查 js_content 是否存在
|
||||
if "js_content" in html:
|
||||
logger.info("[Proxy] len=%d", len(html))
|
||||
return html
|
||||
else:
|
||||
logger.warning("[Proxy] ❌ 内容无效 (len=%d, has_js_content=%s)",
|
||||
len(html), "js_content" in html)
|
||||
logger.warning("[Proxy] invalid content (len=%d, has_js_content=False)", len(html))
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error("[Proxy] ❌ %s", str(e)[:100])
|
||||
logger.error("[Proxy] %s", str(e)[:100])
|
||||
return None
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -61,35 +61,53 @@ def process_article_content(html: str, proxy_base_url: str = None) -> Dict:
|
|||
}
|
||||
|
||||
|
||||
def _extract_div_inner(html: str, open_tag_pattern: str) -> str:
|
||||
"""
|
||||
Extract the inner HTML of a <div> matched by open_tag_pattern,
|
||||
correctly handling nested <div> tags by counting open/close depth.
|
||||
"""
|
||||
m = re.search(open_tag_pattern, html, re.DOTALL | re.IGNORECASE)
|
||||
if not m:
|
||||
return ""
|
||||
|
||||
start = m.end()
|
||||
depth = 1
|
||||
pos = start
|
||||
open_re = re.compile(r'<div[\s>/]', re.IGNORECASE)
|
||||
close_re = re.compile(r'</div\s*>', re.IGNORECASE)
|
||||
|
||||
while depth > 0 and pos < len(html):
|
||||
next_open = open_re.search(html, pos)
|
||||
next_close = close_re.search(html, pos)
|
||||
|
||||
if next_close is None:
|
||||
break
|
||||
|
||||
if next_open and next_open.start() < next_close.start():
|
||||
depth += 1
|
||||
pos = next_open.end()
|
||||
else:
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
return html[start:next_close.start()].strip()
|
||||
pos = next_close.end()
|
||||
|
||||
return html[start:].strip()
|
||||
|
||||
|
||||
def extract_content(html: str) -> str:
    """
    Extract the article body HTML from a WeChat article page.

    The body lives in the <div id="js_content"> element, whose inner HTML
    already contains text and images in reading order; some pages use
    class="rich_media_content" instead. Nested <div>s inside the body are
    handled by depth counting in _extract_div_inner — a plain non-greedy
    regex would truncate at the first inner </div>.

    Args:
        html: Full page HTML.

    Returns:
        The stripped inner HTML of the body div, or "" when no body
        container is found.
    """
    # Strategy 1: the standard id="js_content" container.
    content = _extract_div_inner(html, r'<div[^>]*\bid=["\']js_content["\'][^>]*>')
    if content:
        return content

    # Strategy 2: fall back to the rich_media_content class.
    content = _extract_div_inner(html, r'<div[^>]*\bclass=["\'][^"\']*rich_media_content[^"\']*["\'][^>]*>')
    if content:
        return content

    logger.warning("Failed to extract article body")
    return ""
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue