diff --git a/routes/article.py b/routes/article.py index d8291d4..18724c3 100644 --- a/routes/article.py +++ b/routes/article.py @@ -8,16 +8,21 @@ 文章路由 - FastAPI版本 """ +import logging +import re +from typing import Optional, List + from fastapi import APIRouter, HTTPException, Request from pydantic import BaseModel, Field -from typing import Optional, List -import re + from utils.auth_manager import auth_manager -from utils.helpers import extract_article_info, parse_article_url +from utils.helpers import extract_article_info, parse_article_url, is_image_text_message, has_article_content, get_client_ip from utils.rate_limiter import rate_limiter from utils.webhook import webhook from utils.http_client import fetch_page +logger = logging.getLogger(__name__) + router = APIRouter() class ArticleRequest(BaseModel): @@ -56,23 +61,25 @@ async def get_article(article_request: ArticleRequest, request: Request): - `publish_time`: 发布时间戳 - `images`: 文章内的图片列表 """ - client_ip = request.client.host if request.client else "unknown" + client_ip = get_client_ip(request) allowed, error_msg = rate_limiter.check_rate_limit(client_ip, "/api/article") if not allowed: - return {"success": False, "error": f"⏱️ {error_msg}"} + return {"success": False, "error": f"Rate limited: {error_msg}"} credentials = auth_manager.get_credentials() if not credentials: return {"success": False, "error": "服务器未登录,请先访问管理页面扫码登录"} try: + logger.info("[Article] request from %s: %s", client_ip, article_request.url[:80]) + html = await fetch_page( article_request.url, extra_headers={"Referer": "https://mp.weixin.qq.com/"}, - timeout=120 # WeChat 大文章可能超时,延长至 120 秒 + timeout=120 ) - if "js_content" not in html: + if not has_article_content(html): if "verify" in html or "验证" in html or "环境异常" in html: await webhook.notify('verification_required', { 'url': article_request.url, diff --git a/utils/article_fetcher.py b/utils/article_fetcher.py index 0c21f14..496c445 100644 --- a/utils/article_fetcher.py +++ b/utils/article_fetcher.py @@ -70,12 +70,12 @@ async def _fetch_via_proxy( timeout=timeout ) - # 验证内容有效性: 只检查 js_content 是否存在 - if "js_content" in html: + from utils.helpers import has_article_content + if has_article_content(html): logger.info("[Proxy] len=%d", len(html)) return html else: - logger.warning("[Proxy] invalid content (len=%d, has_js_content=False)", len(html)) + logger.warning("[Proxy] invalid content (len=%d, no known content marker)", len(html)) return None except Exception as e: diff --git a/utils/content_processor.py b/utils/content_processor.py index 4ca30e5..5fad64f 100644 --- a/utils/content_processor.py +++ b/utils/content_processor.py @@ -97,17 +97,43 @@ def _extract_div_inner(html: str, open_tag_pattern: str) -> str: def extract_content(html: str) -> str: """ - Extract article body from the js_content div, handling nested divs. + Extract article body, trying multiple container patterns. + Different WeChat account types (government, media, personal) use + different HTML structures. We try them in order of specificity. + For image-text messages (item_show_type=8), delegates to helpers. """ + from utils.helpers import is_image_text_message, _extract_image_text_content + + if is_image_text_message(html): + result = _extract_image_text_content(html) + return result.get('content', '') + + # Pattern 1: id="js_content" (most common) content = _extract_div_inner(html, r'
'
+ f''
+ f'
' + f'{len(images)} images' + f'
' + ) + html_parts.append('\n'.join(gallery_imgs)) + + # 文字描述区域 + if desc: + text_lines = [] + for line in desc.split('\n'): + line = line.strip() + if line: + text_lines.append( + f'{line}
' + ) + html_parts.append('\n'.join(text_lines)) + + content = '\n'.join(html_parts) + plain_content = desc if desc else '' + + return { + 'content': content, + 'plain_content': plain_content, + 'images': images, + } + + def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict: """ 从HTML中提取文章信息 - + Args: html: 文章HTML内容 params: URL参数(可选,用于返回__biz等信息) - + Returns: 文章信息字典 """ - + title = '' + # 图文消息的标题通常在 window.msg_title 中 title_match = ( re.search(r'