feat: add audio extraction, type-10 posts, and comprehensive unavailability detection

Made-with: Cursor
This commit is contained in:
tmwgsicp 2026-03-23 14:19:41 +08:00
parent f9ccb1b2ae
commit 94a0b78ca8
7 changed files with 330 additions and 41 deletions

1
.gitignore vendored
View File

@ -35,6 +35,7 @@ venv.bak/
.vscode/
.idea/
.cursor/
.claude/
*.swp
*.swo
*~

View File

@ -21,6 +21,8 @@ WEBHOOK_NOTIFICATION_INTERVAL=300
# RSS 订阅配置
# 轮询间隔(秒),默认 3600(1 小时)
RSS_POLL_INTERVAL=3600
# 每次轮询拉取的文章数,默认 10,高频更新的公众号可适当增大
ARTICLES_PER_POLL=10
# RSS 轮询时是否获取完整文章内容true/false默认 true
# ⚠️ 启用时强烈建议配置下方的 PROXY_URLS避免账号被微信风控
RSS_FETCH_FULL_CONTENT=true

View File

@ -44,42 +44,83 @@ async def _fetch_via_proxy(
article_url: str,
timeout: int,
wechat_cookie: Optional[str] = None,
wechat_token: Optional[str] = None
wechat_token: Optional[str] = None,
max_retries: int = 2
) -> Optional[str]:
"""通过 SOCKS5 代理或直连获取文章"""
"""
通过 SOCKS5 代理或直连获取文章
Args:
article_url: 文章 URL
timeout: 超时时间
wechat_cookie: 微信 Cookie
wechat_token: 微信 Token
max_retries: 内容验证失败时的最大重试次数(每次会尝试不同代理)
"""
try:
# 使用现有的 http_client支持代理池轮转 + 直连兜底)
from utils.http_client import fetch_page
logger.info("[Proxy] %s", article_url[:80])
logger.info("[Fetch] %s", article_url[:80])
# 构建完整 URL带 token
full_url = article_url
if wechat_token:
separator = '&' if '?' in article_url else '?'
full_url = f"{article_url}{separator}token={wechat_token}"
# 准备请求头
extra_headers = {"Referer": "https://mp.weixin.qq.com/"}
if wechat_cookie:
extra_headers["Cookie"] = wechat_cookie
html = await fetch_page(
full_url,
extra_headers=extra_headers,
timeout=timeout
)
for attempt in range(max_retries + 1):
try:
html = await fetch_page(
full_url,
extra_headers=extra_headers,
timeout=timeout
)
from utils.helpers import has_article_content, is_article_unavailable
if is_article_unavailable(html):
logger.warning("[Fetch] permanently unavailable (attempt %d/%d) %s",
attempt + 1, max_retries + 1, article_url[:60])
return html
if has_article_content(html):
logger.info("[Fetch] len=%d (attempt %d/%d)",
len(html), attempt + 1, max_retries + 1)
return html
else:
hint = "unknown"
if "验证" in html or "verify" in html.lower() or "环境异常" in html:
hint = "wechat_verification"
elif "请登录" in html or "login" in html.lower():
hint = "login_required"
elif "location.replace" in html or "location.href" in html:
hint = "redirect_page"
elif len(html) < 1000:
hint = "empty_or_blocked"
logger.warning(
"[Fetch] invalid (len=%d, hint=%s, attempt %d/%d) %s",
len(html), hint, attempt + 1, max_retries + 1,
article_url[:60]
)
if attempt < max_retries:
await asyncio.sleep(1)
continue
except Exception as e:
logger.warning("[Fetch] request error: %s (attempt %d/%d)",
str(e)[:100], attempt + 1, max_retries + 1)
if attempt < max_retries:
await asyncio.sleep(1)
continue
from utils.helpers import has_article_content
if has_article_content(html):
logger.info("[Proxy] len=%d", len(html))
return html
else:
logger.warning("[Proxy] invalid content (len=%d, no known content marker)", len(html))
return None
return None
except Exception as e:
logger.error("[Proxy] %s", str(e)[:100])
logger.error("[Fetch] fatal error: %s", str(e)[:100])
return None
@ -110,9 +151,7 @@ async def fetch_articles_batch(
async with semaphore:
html = await fetch_article_content(url, timeout, wechat_token, wechat_cookie)
results[url] = html
# 避免请求过快
await asyncio.sleep(0.5)
await asyncio.sleep(1)
logger.info("[Batch] 开始批量获取 %d 篇文章", len(article_urls))

View File

@ -100,14 +100,27 @@ def extract_content(html: str) -> str:
Extract article body, trying multiple container patterns.
Different WeChat account types (government, media, personal) use
different HTML structures. We try them in order of specificity.
For image-text messages (item_show_type=8), delegates to helpers.
For image-text messages (item_show_type=8) and short posts (item_show_type=10),
delegates to helpers.
"""
from utils.helpers import is_image_text_message, _extract_image_text_content
from utils.helpers import (
is_image_text_message, _extract_image_text_content,
is_short_content_message, _extract_short_content,
is_audio_message, _extract_audio_content,
)
if is_image_text_message(html):
result = _extract_image_text_content(html)
return result.get('content', '')
if is_short_content_message(html):
result = _extract_short_content(html)
return result.get('content', '')
if is_audio_message(html):
result = _extract_audio_content(html)
return result.get('content', '')
# Pattern 1: id="js_content" (most common)
content = _extract_div_inner(html, r'<div[^>]*\bid=["\']js_content["\'][^>]*>')
if content:

View File

@ -62,10 +62,31 @@ def parse_article_url(url: str) -> Optional[Dict[str, str]]:
except Exception:
return None
def get_item_show_type(html: str) -> Optional[str]:
    """Return the page's window.item_show_type value as a string, or None.

    WeChat article pages expose the post type via an inline
    ``window.item_show_type = '<digits>'`` assignment in a script tag.
    """
    match = re.search(r"window\.item_show_type\s*=\s*'(\d+)'", html)
    if match is None:
        return None
    return match.group(1)
def is_image_text_message(html: str) -> bool:
    """Detect image-text posts (item_show_type=8, multi-image + caption style).

    Fix: the block contained both the old inline-regex implementation and the
    new delegating implementation, leaving an unreachable duplicate return.
    Only the delegating version is kept so type detection stays centralized
    in get_item_show_type().
    """
    return get_item_show_type(html) == '8'
def is_short_content_message(html: str) -> bool:
    """Detect short posts / reposts (item_show_type=10): plain text pages
    that carry no js_content div."""
    show_type = get_item_show_type(html)
    return show_type == '10'
def is_audio_message(html: str) -> bool:
    """Detect audio articles: voice clips embedded via <mpvoice> tags or the
    mp-common-mpaudio player component.

    Matches on any of the known audio markers appearing anywhere in the page.
    """
    audio_markers = (
        'voice_encode_fileid',
        '<mpvoice',
        'mp-common-mpaudio',
        'js_editor_audio',
    )
    return any(marker in html for marker in audio_markers)
def _extract_image_text_content(html: str) -> Dict:
@ -198,6 +219,159 @@ def _extract_image_text_content(html: str) -> Dict:
}
def _jsdecode_unescape(s: str) -> str:
"""Unescape JsDecode \\xNN sequences and HTML entities."""
import html as html_module
s = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1), 16)), s)
s = html_module.unescape(s)
s = html_module.unescape(s)
return s
def _extract_short_content(html: str) -> Dict:
    """
    Extract content from item_show_type=10 pages (short posts / reposts).

    These pages have no js_content div; the text and metadata sit inside
    JsDecode() calls in <script> tags, so everything is pulled via regex.

    Returns a dict with 'content' (paragraph HTML), 'plain_content'
    (newline-separated text) and 'images' (cover/head image URLs).
    """
    import html as html_module

    # Prefer content_noencode (plainer text) over content.
    raw_text = ''
    for field in ('content_noencode', 'content'):
        found = re.search(rf"{field}:\s*JsDecode\('([^']*)'\)", html)
        if found and len(found.group(1)) > 10:
            raw_text = _jsdecode_unescape(found.group(1))
            break

    # Cover / head image: keep only URLs from known WeChat image hosts.
    images = []
    head_img = re.search(r"round_head_img:\s*JsDecode\('([^']+)'\)", html)
    if head_img:
        url = _jsdecode_unescape(head_img.group(1))
        if 'mmbiz.qpic.cn' in url or 'wx.qlogo.cn' in url:
            images.append(url)

    # Normalize any remaining literal "\x0a" / "\n" escapes to real newlines.
    normalized = raw_text.replace('\\x0a', '\n').replace('\\n', '\n') if raw_text else ''

    # Render each non-empty line as a standalone styled paragraph.
    paragraphs = []
    if raw_text:
        for raw_line in normalized.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue
            escaped = html_module.escape(stripped)
            paragraphs.append(
                f'<p style="margin:0 0 8px;line-height:1.8;font-size:15px;color:#333">{escaped}</p>'
            )

    return {
        'content': '\n'.join(paragraphs),
        'plain_content': normalized,
        'images': images,
    }
def _extract_audio_content(html: str) -> Dict:
    """
    Extract audio content from WeChat voice articles.

    Audio articles embed voice clips via <mpvoice> or mp-common-mpaudio
    tags; a playable URL is built from each tag's voice_encode_fileid.
    Also extracts any surrounding text content from js_content.

    Returns a dict with 'content' (HTML), 'plain_content' (text lines plus
    "[Audio] name - url" entries), 'images' (always empty here) and
    'audios' (list of {'name', 'url', 'duration'} dicts).
    """
    import html as html_module
    from bs4 import BeautifulSoup  # third-party dependency, imported lazily
    audio_items = []
    # Pattern 1: <mpvoice voice_encode_fileid="..." name="..." .../>
    for m in re.finditer(
        r'<mpvoice[^>]*voice_encode_fileid=["\']([^"\']+)["\'][^>]*/?>',
        html, re.IGNORECASE
    ):
        fileid = m.group(1)
        # name / play_length are optional attributes on the same tag.
        name_m = re.search(r'name=["\']([^"\']*)["\']', m.group(0))
        name = html_module.unescape(name_m.group(1)) if name_m else ''
        play_length_m = re.search(r'play_length=["\'](\d+)["\']', m.group(0))
        duration = int(play_length_m.group(1)) if play_length_m else 0
        audio_url = f"https://res.wx.qq.com/voice/getvoice?mediaid={fileid}"
        audio_items.append({'name': name, 'url': audio_url, 'duration': duration})
    # Pattern 2: mp-common-mpaudio with voice_encode_fileid in data or attributes
    # (fallback, only tried when Pattern 1 matched nothing).
    if not audio_items:
        for m in re.finditer(
            r'<mp-common-mpaudio[^>]*voice_encode_fileid=["\']([^"\']+)["\'][^>]*>',
            html, re.IGNORECASE
        ):
            fileid = m.group(1)
            name_m = re.search(r'name=["\']([^"\']*)["\']', m.group(0))
            name = html_module.unescape(name_m.group(1)) if name_m else ''
            play_length_m = re.search(r'play_length=["\'](\d+)["\']', m.group(0))
            duration = int(play_length_m.group(1)) if play_length_m else 0
            audio_url = f"https://res.wx.qq.com/voice/getvoice?mediaid={fileid}"
            audio_items.append({'name': name, 'url': audio_url, 'duration': duration})
    # Build HTML content
    html_parts = []
    # Extract surrounding text from js_content (some audio articles have text too).
    # The lookahead-ish tail anchors the div end at the following <script> or
    # rich_media_tool div so nested divs inside js_content don't cut it short.
    text_content = ''
    js_match = re.search(
        r'<div[^>]*id=["\']js_content["\'][^>]*>([\s\S]*?)</div>\s*(?:<script|<div[^>]*class=["\']rich_media_tool)',
        html, re.IGNORECASE
    )
    if js_match:
        try:
            soup = BeautifulSoup(js_match.group(1), 'html.parser')
            # Strip the audio player tags so they don't leak into the text.
            for tag in soup.find_all(['mpvoice', 'mp-common-mpaudio']):
                tag.decompose()
            text_content = soup.get_text(separator='\n', strip=True)
        except Exception:
            # Best-effort: fall back to audio-only output if parsing fails.
            pass
    if text_content:
        for line in text_content.split('\n'):
            line = line.strip()
            if line:
                html_parts.append(f'<p style="margin:0 0 8px;line-height:1.8">{html_module.escape(line)}</p>')
    # One styled card per audio item, with an mm:ss duration when known.
    for i, audio in enumerate(audio_items):
        dur_str = ''
        if audio['duration'] > 0:
            minutes = audio['duration'] // 60
            seconds = audio['duration'] % 60
            dur_str = f' ({minutes}:{seconds:02d})'
        display_name = audio['name'] or f'Audio {i + 1}'
        html_parts.append(
            f'<div style="margin:12px 0;padding:12px 16px;background:#f6f6f6;border-radius:8px">'
            f'<p style="margin:0 0 4px;font-size:15px;font-weight:500">'
            f'{html_module.escape(display_name)}{dur_str}</p>'
            f'<a href="{audio["url"]}" style="color:#1890ff;font-size:14px">'
            f'[Play Audio / Click to Listen]</a>'
            f'</div>'
        )
    content = '\n'.join(html_parts) if html_parts else ''
    # Plain-text mirror: body text first, then one line per audio item.
    plain_parts = []
    if text_content:
        plain_parts.append(text_content)
    for i, audio in enumerate(audio_items):
        display_name = audio['name'] or f'Audio {i + 1}'
        plain_parts.append(f"[Audio] {display_name} - {audio['url']}")
    return {
        'content': content,
        'plain_content': '\n'.join(plain_parts),
        'images': [],
        'audios': audio_items,
    }
def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
"""
从HTML中提取文章信息
@ -217,11 +391,13 @@ def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
re.search(r'<h2[^>]*class=[^>]*rich_media_title[^>]*>([\s\S]*?)</h2>', html, re.IGNORECASE) or
re.search(r"var\s+msg_title\s*=\s*'([^']+)'\.html\(false\)", html) or
re.search(r"window\.msg_title\s*=\s*window\.title\s*=\s*'([^']*)'", html) or
re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', html)
re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', html) or
re.search(r"msg_title:\s*JsDecode\('([^']+)'\)", html)
)
if title_match:
title = title_match.group(1)
title = _jsdecode_unescape(title)
title = re.sub(r'<[^>]+>', '', title)
title = title.replace('&quot;', '"').replace('&amp;', '&').strip()
@ -251,12 +427,22 @@ def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
except (ValueError, TypeError):
pass
# 检测是否为图文消息item_show_type=8
# 检测特殊内容类型
if is_image_text_message(html):
img_text_data = _extract_image_text_content(html)
content = img_text_data['content']
images = img_text_data['images']
plain_content = img_text_data['plain_content']
elif is_short_content_message(html):
short_data = _extract_short_content(html)
content = short_data['content']
images = short_data['images']
plain_content = short_data['plain_content']
elif is_audio_message(html):
audio_data = _extract_audio_content(html)
content = audio_data['content']
images = audio_data['images']
plain_content = audio_data['plain_content']
else:
content = ''
images = []
@ -318,17 +504,23 @@ def has_article_content(html: str) -> bool:
"""
Check whether the fetched HTML likely contains article content.
Different WeChat account types use different content containers.
Must match actual HTML elements (id/class attributes), not random JS strings,
to avoid false positives on WeChat verification pages (~1.9MB) that contain
"js_content" references in their JavaScript code.
"""
content_markers = [
"js_content",
"rich_media_content",
"rich_media_area_primary",
"page-content",
"page_content",
element_markers = [
'id="js_content"',
'class="rich_media_content',
'class="rich_media_area_primary',
'id="page-content"',
'id="page_content"',
]
if any(marker in html for marker in content_markers):
if any(marker in html for marker in element_markers):
return True
return is_image_text_message(html)
if is_image_text_message(html) or is_short_content_message(html) or is_audio_message(html):
return True
return False
def get_client_ip(request) -> str:
@ -349,6 +541,37 @@ def is_article_deleted(html: str) -> bool:
"""检查文章是否被删除"""
return '已删除' in html or 'deleted' in html.lower()
def is_article_unavailable(html: str) -> bool:
    """Return True when the article is permanently unavailable
    (deleted / censored / restricted); thin wrapper over
    get_unavailable_reason()."""
    reason = get_unavailable_reason(html)
    return reason is not None
def get_unavailable_reason(html: str) -> Optional[str]:
    """Return a human-readable reason when the article is permanently
    unavailable (deleted / censored / restricted); otherwise None.

    Markers are tested in declaration order; the first keyword found in the
    page wins.
    """
    unavailable_markers = (
        ("该内容已被发布者删除", "已被发布者删除"),
        ("内容已删除", "已被发布者删除"),
        ("此内容因违规无法查看", "因违规无法查看"),
        ("涉嫌违反相关法律法规和政策", "涉嫌违规被限制"),
        ("此内容发送失败无法查看", "发送失败无法查看"),
        ("该内容暂时无法查看", "暂时无法查看"),
        ("根据作者隐私设置,无法查看该内容", "作者隐私设置不可见"),
        ("接相关投诉,此内容违反", "因投诉违规被限制"),
        ("该文章已被第三方辟谣", "已被第三方辟谣"),
    )
    return next(
        (reason for keyword, reason in unavailable_markers if keyword in html),
        None,
    )
def is_need_verification(html: str) -> bool:
"""检查是否需要验证"""
return ('verify' in html.lower() or

View File

@ -20,13 +20,13 @@ import httpx
from utils.auth_manager import auth_manager
from utils import rss_store
from utils.helpers import extract_article_info, parse_article_url, is_image_text_message, has_article_content
from utils.helpers import extract_article_info, parse_article_url, is_image_text_message, has_article_content, is_article_unavailable, get_unavailable_reason
from utils.http_client import fetch_page
logger = logging.getLogger(__name__)
POLL_INTERVAL = int(os.getenv("RSS_POLL_INTERVAL", "3600"))
ARTICLES_PER_POLL = 10
ARTICLES_PER_POLL = int(os.getenv("ARTICLES_PER_POLL", "10"))
FETCH_FULL_CONTENT = os.getenv("RSS_FETCH_FULL_CONTENT", "true").lower() == "true"
@ -207,10 +207,9 @@ class RSSPoller:
wechat_token = os.getenv("WECHAT_TOKEN", "")
wechat_cookie = os.getenv("WECHAT_COOKIE", "")
# 批量并发获取max_concurrency=5传递微信凭证
results = await fetch_articles_batch(
article_links,
max_concurrency=5,
max_concurrency=3,
timeout=60,
wechat_token=wechat_token,
wechat_cookie=wechat_cookie
@ -225,7 +224,18 @@ class RSSPoller:
continue
html = results.get(link)
if not html or not has_article_content(html):
if not html:
logger.warning("Empty HTML: %s", link[:80])
enriched.append(article)
continue
if is_article_unavailable(html):
reason = get_unavailable_reason(html) or "unknown"
logger.warning("Article permanently unavailable (%s): %s", reason, link[:80])
article["content"] = f"<p>[unavailable] {reason}</p>"
article["plain_content"] = f"[unavailable] {reason}"
enriched.append(article)
continue
if not has_article_content(html):
logger.warning("No content in HTML: %s", link[:80])
enriched.append(article)
continue

View File

@ -22,6 +22,7 @@ EVENT_LABELS = {
"login_success": "登录成功",
"login_expired": "登录过期",
"verification_required": "触发验证",
"content_fetch_failed": "文章内容获取失败",
}