fix: support image-text messages and multi-container article extraction

This commit is contained in:
tmwgsicp 2026-03-21 06:23:24 +08:00
parent 4825edc355
commit f9ccb1b2ae
6 changed files with 295 additions and 72 deletions

View File

@ -8,16 +8,21 @@
文章路由 - FastAPI版本
"""
import logging
import re
from typing import Optional, List
from fastapi import APIRouter, HTTPException, Request
from pydantic import BaseModel, Field
from typing import Optional, List
import re
from utils.auth_manager import auth_manager
from utils.helpers import extract_article_info, parse_article_url
from utils.helpers import extract_article_info, parse_article_url, is_image_text_message, has_article_content, get_client_ip
from utils.rate_limiter import rate_limiter
from utils.webhook import webhook
from utils.http_client import fetch_page
logger = logging.getLogger(__name__)
router = APIRouter()
class ArticleRequest(BaseModel):
@ -56,23 +61,25 @@ async def get_article(article_request: ArticleRequest, request: Request):
- `publish_time`: 发布时间戳
- `images`: 文章内的图片列表
"""
client_ip = request.client.host if request.client else "unknown"
client_ip = get_client_ip(request)
allowed, error_msg = rate_limiter.check_rate_limit(client_ip, "/api/article")
if not allowed:
return {"success": False, "error": f"⏱️ {error_msg}"}
return {"success": False, "error": f"Rate limited: {error_msg}"}
credentials = auth_manager.get_credentials()
if not credentials:
return {"success": False, "error": "服务器未登录,请先访问管理页面扫码登录"}
try:
logger.info("[Article] request from %s: %s", client_ip, article_request.url[:80])
html = await fetch_page(
article_request.url,
extra_headers={"Referer": "https://mp.weixin.qq.com/"},
timeout=120 # WeChat 大文章可能超时,延长至 120 秒
timeout=120
)
if "js_content" not in html:
if not has_article_content(html):
if "verify" in html or "验证" in html or "环境异常" in html:
await webhook.notify('verification_required', {
'url': article_request.url,

View File

@ -70,12 +70,12 @@ async def _fetch_via_proxy(
timeout=timeout
)
# 验证内容有效性: 只检查 js_content 是否存在
if "js_content" in html:
from utils.helpers import has_article_content
if has_article_content(html):
logger.info("[Proxy] len=%d", len(html))
return html
else:
logger.warning("[Proxy] invalid content (len=%d, has_js_content=False)", len(html))
logger.warning("[Proxy] invalid content (len=%d, no known content marker)", len(html))
return None
except Exception as e:

View File

@ -97,17 +97,43 @@ def _extract_div_inner(html: str, open_tag_pattern: str) -> str:
def extract_content(html: str) -> str:
    """
    Extract the article body, trying multiple container patterns.

    Different WeChat account types (government, media, personal) use
    different HTML structures, so containers are probed in order of
    specificity. Image-text messages (item_show_type=8) carry no body
    container at all and are delegated to the helpers module.

    Returns the inner HTML of the first matching container, or "" when
    nothing matched.
    """
    from utils.helpers import is_image_text_message, _extract_image_text_content
    if is_image_text_message(html):
        result = _extract_image_text_content(html)
        return result.get('content', '')
    # Probed most-specific first; the first non-empty match wins.
    container_patterns = [
        # Pattern 1: id="js_content" (most common)
        r'<div[^>]*\bid=["\']js_content["\'][^>]*>',
        # Pattern 2: class contains rich_media_content
        r'<div[^>]*\bclass=["\'][^"\']*rich_media_content[^"\']*["\'][^>]*>',
        # Pattern 3: id="page-content" (government/institutional accounts)
        r'<div[^>]*\bid=["\']page-content["\'][^>]*>',
        # Pattern 4: class contains rich_media_area_primary_inner
        r'<div[^>]*\bclass=["\'][^"\']*rich_media_area_primary_inner[^"\']*["\'][^>]*>',
        # Pattern 5: id="js_article" (alternative article container)
        r'<div[^>]*\bid=["\']js_article["\'][^>]*>',
    ]
    for pattern in container_patterns:
        content = _extract_div_inner(html, pattern)
        if content:
            return content
    logger.warning("Failed to extract article body from any known container")
    return ""

View File

@ -62,31 +62,169 @@ def parse_article_url(url: str) -> Optional[Dict[str, str]]:
except Exception:
return None
def is_image_text_message(html: str) -> bool:
    """Return True when the page is an image-text message (item_show_type=8,
    Xiaohongshu-style multi-image + caption post)."""
    marker = re.search(r"window\.item_show_type\s*=\s*'(\d+)'", html)
    if marker is None:
        return False
    return marker.group(1) == '8'
def _extract_image_text_content(html: str) -> Dict:
    """
    Extract content from an image-text message (item_show_type=8).

    These pages are structured completely differently from regular articles:
    - images sit in ``picture_page_info_list`` (plain or JsDecode()-wrapped)
    - the caption text sits in the meta description / ``content_desc``
    - there is no ``#js_content`` div

    Returns a dict with keys ``content`` (RSS-compatible HTML: vertical
    gallery + caption), ``plain_content`` (caption text) and ``images``
    (deduplicated main-image CDN URLs).
    """
    import html as html_module  # aliased: the parameter shadows the stdlib name

    # The page carries the image list in one of two shapes:
    #   1. picture_page_info_list: [ { cdn_url: JsDecode('...'), ... } ]   (JsDecode-wrapped)
    #   2. picture_page_info_list = [ { width:..., height:..., cdn_url: '...' } ]  (plain)
    # Within each item the FIRST cdn_url is the main picture; cdn_urls inside
    # watermark_info are watermarks and must be skipped.
    images = []
    # Prefer the plain format (2): easier to parse and it lists every picture.
    simple_list_pos = html.find('picture_page_info_list = [')
    if simple_list_pos >= 0:
        bracket_start = html.find('[', simple_list_pos)
        depth = 0
        end = bracket_start
        # Bracket-match forward (bounded window) to locate the closing ']'.
        for end in range(bracket_start, min(bracket_start + 20000, len(html))):
            if html[end] == '[':
                depth += 1
            elif html[end] == ']':
                depth -= 1
                if depth == 0:
                    break
        block = html[bracket_start:end + 1]
        # Split on top-level '{'; the first cdn_url of each item is the main image.
        items = re.split(r'\n\s{4,10}\{', block)
        for item in items:
            m = re.search(r"cdn_url:\s*'([^']+)'", item)
            if m:
                url = m.group(1)
                if url not in images and ('mmbiz.qpic.cn' in url or 'mmbiz.qlogo.cn' in url):
                    images.append(url)
    # Fallback: the JsDecode-wrapped format (1).
    if not images:
        jsdecode_list_match = re.search(
            r'picture_page_info_list:\s*\[', html
        )
        if jsdecode_list_match:
            block_start = jsdecode_list_match.end() - 1
            depth = 0
            end = block_start
            for end in range(block_start, min(block_start + 20000, len(html))):
                if html[end] == '[':
                    depth += 1
                elif html[end] == ']':
                    depth -= 1
                    if depth == 0:
                        break
            block = html[block_start:end + 1]
            # Split on top-level '{' (lookahead keeps only item-opening braces).
            items = re.split(r'\n\s{10,30}\{(?=\s*\n\s*cdn_url)', block)
            for item in items:
                m = re.search(r"cdn_url:\s*JsDecode\('([^']+)'\)", item)
                if m:
                    url = m.group(1).replace('\\x26amp;', '&').replace('\\x26', '&')
                    if url not in images and ('mmbiz.qpic.cn' in url or 'mmbiz.qlogo.cn' in url):
                        images.append(url)
    # Caption text, method 1: the meta description tag.
    desc = ''
    desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]*)"', html)
    if desc_match:
        desc = desc_match.group(1)
        # Undo WeChat's double encoding (\x26lt; -> &lt; -> <): first the
        # \xNN escapes, then two rounds of HTML unescaping.
        desc = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1), 16)), desc)
        desc = html_module.unescape(desc)
        desc = html_module.unescape(desc)
        # Strip leftover HTML tags and normalize escaped newlines.
        desc = re.sub(r'<[^>]+>', '', desc)
        desc = desc.replace('\\x0a', '\n').replace('\\n', '\n')
    # Caption text, method 2: the content_desc JsDecode blob.
    if not desc:
        desc_match2 = re.search(r"content_desc:\s*JsDecode\('([^']*)'\)", html)
        if desc_match2:
            desc = desc_match2.group(1)
            desc = html_module.unescape(desc)
    # Build RSS-reader-friendly HTML: a compact vertical gallery, then text.
    html_parts = []
    if images:
        gallery_imgs = []
        for img_url in images:  # was enumerate() with an unused index
            gallery_imgs.append(
                f'<p style="text-align:center;margin:0 0 6px">'
                f'<img src="{img_url}" data-src="{img_url}" '
                f'style="max-width:480px;width:100%;height:auto;border-radius:4px" />'
                f'</p>'
            )
        # One image-count footer per gallery, after the loop.
        gallery_imgs.append(
            f'<p style="text-align:center;color:#999;font-size:12px;margin:4px 0 0">'
            f'{len(images)} images'
            f'</p>'
        )
        html_parts.append('\n'.join(gallery_imgs))
    # Caption paragraphs, one <p> per non-empty line.
    if desc:
        text_lines = []
        for line in desc.split('\n'):
            line = line.strip()
            if line:
                text_lines.append(
                    f'<p style="margin:0 0 8px;line-height:1.8;font-size:15px;color:#333">{line}</p>'
                )
        html_parts.append('\n'.join(text_lines))
    content = '\n'.join(html_parts)
    plain_content = desc if desc else ''
    return {
        'content': content,
        'plain_content': plain_content,
        'images': images,
    }
def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
"""
从HTML中提取文章信息
Args:
html: 文章HTML内容
params: URL参数可选用于返回__biz等信息
Returns:
文章信息字典
"""
title = ''
# 图文消息的标题通常在 window.msg_title 中
title_match = (
re.search(r'<h1[^>]*class=[^>]*rich_media_title[^>]*>([\s\S]*?)</h1>', html, re.IGNORECASE) or
re.search(r'<h2[^>]*class=[^>]*rich_media_title[^>]*>([\s\S]*?)</h2>', html, re.IGNORECASE) or
re.search(r"var\s+msg_title\s*=\s*'([^']+)'\.html\(false\)", html) or
re.search(r"window\.msg_title\s*=\s*window\.title\s*=\s*'([^']*)'", html) or
re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', html)
)
if title_match:
title = title_match.group(1)
title = re.sub(r'<[^>]+>', '', title)
title = title.replace('&quot;', '"').replace('&amp;', '&').strip()
author = ''
author_match = (
re.search(r'<a[^>]*id="js_name"[^>]*>([\s\S]*?)</a>', html, re.IGNORECASE) or
@ -94,72 +232,81 @@ def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
re.search(r'<meta\s+property="og:article:author"\s+content="([^"]+)"', html) or
re.search(r'<a[^>]*class=[^>]*rich_media_meta_nickname[^>]*>([^<]+)</a>', html, re.IGNORECASE)
)
if author_match:
author = author_match.group(1)
author = re.sub(r'<[^>]+>', '', author).strip()
publish_time = 0
time_match = (
re.search(r'var\s+publish_time\s*=\s*"(\d+)"', html) or
re.search(r'var\s+ct\s*=\s*"(\d+)"', html) or
re.search(r"var\s+ct\s*=\s*'(\d+)'", html) or
re.search(r'<em[^>]*id="publish_time"[^>]*>([^<]+)</em>', html)
)
if time_match:
try:
publish_time = int(time_match.group(1))
except (ValueError, TypeError):
pass
content = ''
images = []
# 方法1: 匹配 id="js_content"
content_match = re.search(r'<div[^>]*id="js_content"[^>]*>([\s\S]*?)<script[^>]*>[\s\S]*?</script>', html, re.IGNORECASE)
if not content_match:
# 方法2: 匹配 class包含rich_media_content
content_match = re.search(r'<div[^>]*class="[^"]*rich_media_content[^"]*"[^>]*>([\s\S]*?)</div>', html, re.IGNORECASE)
if content_match and content_match.group(1):
content = content_match.group(1).strip()
# 检测是否为图文消息item_show_type=8
if is_image_text_message(html):
img_text_data = _extract_image_text_content(html)
content = img_text_data['content']
images = img_text_data['images']
plain_content = img_text_data['plain_content']
else:
# 方法3: 手动截取
js_content_pos = html.find('id="js_content"')
if js_content_pos > 0:
start = html.find('>', js_content_pos) + 1
script_pos = html.find('<script', start)
if script_pos > start:
content = html[start:script_pos].strip()
if content:
# 提取data-src属性
img_regex = re.compile(r'<img[^>]+data-src="([^"]+)"')
for img_match in img_regex.finditer(content):
img_url = img_match.group(1)
if img_url not in images:
images.append(img_url)
# 提取src属性
img_regex2 = re.compile(r'<img[^>]+src="([^"]+)"')
for img_match in img_regex2.finditer(content):
img_url = img_match.group(1)
if not img_url.startswith('data:') and img_url not in images:
images.append(img_url)
content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', content, flags=re.IGNORECASE)
content = ''
images = []
# 方法1: 匹配 id="js_content"
content_match = re.search(r'<div[^>]*id="js_content"[^>]*>([\s\S]*?)<script[^>]*>[\s\S]*?</script>', html, re.IGNORECASE)
if not content_match:
# 方法2: 匹配 class包含rich_media_content
content_match = re.search(r'<div[^>]*class="[^"]*rich_media_content[^"]*"[^>]*>([\s\S]*?)</div>', html, re.IGNORECASE)
if content_match and content_match.group(1):
content = content_match.group(1).strip()
else:
# 方法3: 手动截取
js_content_pos = html.find('id="js_content"')
if js_content_pos > 0:
start = html.find('>', js_content_pos) + 1
script_pos = html.find('<script', start)
if script_pos > start:
content = html[start:script_pos].strip()
if content:
# 提取data-src属性
img_regex = re.compile(r'<img[^>]+data-src="([^"]+)"')
for img_match in img_regex.finditer(content):
img_url = img_match.group(1)
if img_url not in images:
images.append(img_url)
# 提取src属性
img_regex2 = re.compile(r'<img[^>]+src="([^"]+)"')
for img_match in img_regex2.finditer(content):
img_url = img_match.group(1)
if not img_url.startswith('data:') and img_url not in images:
images.append(img_url)
content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', content, flags=re.IGNORECASE)
plain_content = html_to_text(content) if content else ''
__biz = params.get('__biz', 'unknown') if params else 'unknown'
publish_time_str = ''
if publish_time > 0:
from datetime import datetime
dt = datetime.fromtimestamp(publish_time)
publish_time_str = dt.strftime('%Y-%m-%d %H:%M:%S')
return {
'title': title,
'content': content,
'plain_content': html_to_text(content) if content else '',
'plain_content': plain_content,
'images': images,
'author': author,
'publish_time': publish_time,
@ -167,14 +314,45 @@ def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
'__biz': __biz
}
def has_article_content(html: str) -> bool:
    """
    Heuristically decide whether the fetched HTML holds article content.

    Different WeChat account types ship different container markup, so every
    known container id/class is probed; image-text messages have no body
    container at all and are detected via their item_show_type marker.
    """
    known_markers = (
        "js_content",
        "rich_media_content",
        "rich_media_area_primary",
        "page-content",
        "page_content",
    )
    for marker in known_markers:
        if marker in html:
            return True
    return is_image_text_message(html)
def get_client_ip(request) -> str:
    """
    Resolve the real client IP of a (possibly reverse-proxied) request.

    Headers are consulted in priority order: X-Forwarded-For (first hop),
    then X-Real-IP, falling back to the direct socket peer address and
    finally the literal "unknown".
    """
    xff = request.headers.get("x-forwarded-for", "")
    if xff:
        first_hop = xff.split(",")[0]
        return first_hop.strip()
    real_ip = request.headers.get("x-real-ip", "")
    if real_ip:
        return real_ip.strip()
    if request.client:
        return request.client.host
    return "unknown"
def is_article_deleted(html: str) -> bool:
    """Return True when the page indicates the article was removed
    (Chinese deletion notice or a case-insensitive 'deleted' marker)."""
    if '已删除' in html:
        return True
    return 'deleted' in html.lower()
def is_need_verification(html: str) -> bool:
    """Return True when the page demands human verification
    (captcha / 'environment anomaly' interstitial instead of the article)."""
    return ('verify' in html.lower() or
            '验证' in html or
            '环境异常' in html)
def is_login_required(html: str) -> bool:

View File

@ -20,7 +20,7 @@ import httpx
from utils.auth_manager import auth_manager
from utils import rss_store
from utils.helpers import extract_article_info, parse_article_url
from utils.helpers import extract_article_info, parse_article_url, is_image_text_message, has_article_content
from utils.http_client import fetch_page
logger = logging.getLogger(__name__)
@ -225,8 +225,8 @@ class RSSPoller:
continue
html = results.get(link)
if not html or "js_content" not in html:
logger.warning("No content in HTML: %s", link[:80])
if not html or not has_article_content(html):
logger.warning("No content in HTML: %s", link[:80])
enriched.append(article)
continue
@ -246,7 +246,7 @@ class RSSPoller:
article_info = extract_article_info(html, parse_article_url(link))
article["author"] = article_info.get("author", "")
logger.info("Content fetched: %s... (%d chars, %d images)",
logger.info("Content fetched: %s... (%d chars, %d images)",
link[:50],
len(article["content"]),
len(result.get("images", [])))

View File

@ -133,17 +133,29 @@ def update_last_poll(fakeid: str):
# ── 文章缓存 ─────────────────────────────────────────────
def save_articles(fakeid: str, articles: List[Dict]) -> int:
"""批量保存文章,返回新增数量"""
"""
批量保存文章返回新增数量
If an article already exists but has empty content, update it with new content.
"""
conn = _get_conn()
inserted = 0
try:
for a in articles:
content = a.get("content", "")
plain_content = a.get("plain_content", "")
try:
conn.execute(
"INSERT OR IGNORE INTO articles "
cursor = conn.execute(
"INSERT INTO articles "
"(fakeid, aid, title, link, digest, cover, author, "
"content, plain_content, publish_time, fetched_at) "
"VALUES (?,?,?,?,?,?,?,?,?,?,?)",
"VALUES (?,?,?,?,?,?,?,?,?,?,?) "
"ON CONFLICT(fakeid, link) DO UPDATE SET "
"content = CASE WHEN excluded.content != '' AND articles.content = '' "
" THEN excluded.content ELSE articles.content END, "
"plain_content = CASE WHEN excluded.plain_content != '' AND articles.plain_content = '' "
" THEN excluded.plain_content ELSE articles.plain_content END, "
"author = CASE WHEN excluded.author != '' AND articles.author = '' "
" THEN excluded.author ELSE articles.author END",
(
fakeid,
a.get("aid", ""),
@ -152,13 +164,13 @@ def save_articles(fakeid: str, articles: List[Dict]) -> int:
a.get("digest", ""),
a.get("cover", ""),
a.get("author", ""),
a.get("content", ""),
a.get("plain_content", ""),
content,
plain_content,
a.get("publish_time", 0),
int(time.time()),
),
)
if conn.total_changes:
if cursor.rowcount > 0:
inserted += 1
except sqlite3.IntegrityError:
pass