From 8d907435846df03cf60d70950718368f62edeb32 Mon Sep 17 00:00:00 2001
From: tmwgsicp <2589462900@qq.com>
Date: Sun, 29 Mar 2026 20:29:07 +0800
Subject: [PATCH] fix(docker): resolve proxy pool configuration not loading in
 Docker deployment

Problem:
Docker starts the app with the 'uvicorn app:app' command, which skips the
if __name__ == '__main__' block, so load_dotenv() is never executed and
PROXY_URLS from .env is never loaded.

Solution:
Move load_dotenv() to module level in app.py to ensure .env is loaded for all
startup methods (python app.py, uvicorn app:app, docker-compose).

Changes:
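- Add module-level load_dotenv() in app.py
- Update Dockerfile version 1.0.4 -> 1.0.5
- Improve audio content display UI (show a notice instead of an audio link;
  add title and author to audio share pages)
- Route audio share pages (item_show_type=7) to the dedicated extractor
  before the other content checks in extract_content()
- Skip unavailable-content markers in get_unavailable_reason() when a large
  (>1MB) article has a real content container and the keyword is not in the
  leading part of <body>
- Add docs/ and scripts/ to .gitignore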

Made-with: Cursor
---
 .gitignore                 |  4 ++++
 Dockerfile                 |  2 +-
 app.py                     |  3 +++
 utils/content_processor.py | 11 ++++++++--
 utils/helpers.py           | 45 +++++++++++++++++++++++++++++++++-----
 5 files changed, 57 insertions(+), 8 deletions(-)
diff --git a/.gitignore b/.gitignore
index 11f4934..1579281 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,3 +67,7 @@ data/
 
 # SaaS 版本（独立仓库管理）
 saas/
+
+# 个人文档和脚本（不提交）
+docs/
+scripts/
diff --git a/Dockerfile b/Dockerfile
index 99b1c32..39bf790 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,7 +21,7 @@ FROM python:3.11-slim
 
 LABEL maintainer="tmwgsicp"
 LABEL description="WeChat Official Account Article Download API with RSS Support"
-LABEL version="1.0.4"
+LABEL version="1.0.5"
 
 WORKDIR /app
 
diff --git a/app.py b/app.py
index 383e14e..c8d77cc 100644
--- a/app.py
+++ b/app.py
@@ -10,6 +10,9 @@
 """
 
 from contextlib import asynccontextmanager
+from dotenv import load_dotenv
+
+load_dotenv()
 
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
diff --git a/utils/content_processor.py b/utils/content_processor.py
index b888b2c..259802a 100644
--- a/utils/content_processor.py
+++ b/utils/content_processor.py
@@ -105,15 +105,22 @@ def extract_content(html: str) -> str:
     Extract article body, trying multiple container patterns.
     Different WeChat account types (government, media, personal) use
     different HTML structures. We try them in order of specificity.
-    For image-text messages (item_show_type=8) and short posts (item_show_type=10),
-    delegates to helpers.
+    For image-text messages (item_show_type=8), short posts (item_show_type=10),
+    and audio share pages (item_show_type=7), delegates to helpers.
     """
     from utils.helpers import (
         is_image_text_message, _extract_image_text_content,
         is_short_content_message, _extract_short_content,
         is_audio_message, _extract_audio_content,
+        get_item_show_type, _extract_audio_share_content,
     )
 
+    # Check for audio/video share pages (item_show_type=7) FIRST
+    # These pages use Vue apps and have no js_content div
+    if get_item_show_type(html) == '7':
+        result = _extract_audio_share_content(html)
+        return result.get('content', '')
+
     if is_image_text_message(html):
         result = _extract_image_text_content(html)
         return result.get('content', '')
diff --git a/utils/helpers.py b/utils/helpers.py
index fbdd70b..141519b 100644
--- a/utils/helpers.py
+++ b/utils/helpers.py
@@ -361,12 +361,14 @@ def _extract_audio_content(html: str) -> Dict:
             dur_str = f' ({minutes}:{seconds:02d})'
 
         display_name = audio['name'] or f'Audio {i + 1}'
+        # 友好提示：音频需要微信鉴权，不提供无法播放的URL
         html_parts.append(
-            f'<div style="margin:12px 0;padding:12px 16px;background:#f6f6f6;border-radius:8px">'
-            f'<p style="margin:0 0 4px;font-size:15px;font-weight:500">'
-            f'{html_module.escape(display_name)}{dur_str}</p>'
-            f'<a href="{audio["url"]}" style="color:#1890ff;font-size:14px">'
-            f'[Play Audio / Click to Listen]</a>'
+            f'<div style="margin:12px 0;padding:12px 16px;background:#fff9e6;'
+            f'border-left:4px solid #fa8c16;border-radius:4px">'
+            f'<p style="margin:0 0 4px;font-size:14px;color:#595959;font-weight:500">'
+            f'音频内容: {html_module.escape(display_name)}{dur_str}</p>'
+            f'<p style="margin:0;font-size:13px;color:#8c8c8c">'
+            f'此文章包含音频，需要在微信中查看完整内容</p>'
             f'</div>'
         )
 
@@ -428,6 +430,22 @@ def _extract_audio_share_content(html: str) -> Dict:
     # 生成内容
     content_parts = []
     
+    # 标题（如果有）
+    if title:
+        content_parts.append(
+            f'<div style="margin:20px 0;text-align:center">'
+            f'<h2 style="margin:0;font-size:22px;font-weight:600;color:#262626">{title}</h2>'
+            f'</div>'
+        )
+    
+    # 作者（如果有）
+    if author:
+        content_parts.append(
+            f'<div style="margin:12px 0;text-align:center">'
+            f'<p style="margin:0;font-size:14px;color:#8c8c8c">作者: {author}</p>'
+            f'</div>'
+        )
+    
     # 封面图
     if images:
         for img_url in images:
@@ -682,6 +700,8 @@ def get_unavailable_reason(html: str) -> Optional[str]:
         return None
     
     # 真正的不可用标记（静态HTML中的明确文字）
+    # 注意：微信的正常文章HTML中可能在JS代码里包含"已删除"/"违规"等字符串
+    # 需要确保这些关键字是在实际内容中，而不是在JS字符串字面量中
     markers = [
         ("该内容已被发布者删除", "已被发布者删除"),
         ("内容已删除", "已被发布者删除"),
@@ -694,6 +714,21 @@ def get_unavailable_reason(html: str) -> Optional[str]:
     ]
     for keyword, reason in markers:
         if keyword in html:
+            # 额外验证：如果HTML很大(>1MB) 且有真实的内容容器，
+            # 说明是正常文章，"已删除"/"违规"可能只是JS代码中的字符串
+            if len(html) > 1000000:
+                has_real_content = (
+                    'id="js_content"' in html or
+                    'class="rich_media_content' in html
+                )
+                if has_real_content:
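+                    # Further check: look for the keyword in the part of <body> that precedes the first <script>,
+                    # searching only the first 50,000 characters of the HTML; if it is not there, skip this marker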
+                    import re
+                    body_match = re.search(r'<body[^>]*>(.*?)(?:<script|$)', html[:50000], re.DOTALL | re.IGNORECASE)
+                    if body_match and keyword not in body_match.group(1):
+                        # 关键字不在body前部，可能是JS代码，跳过此marker
+                        continue
             return reason
     
     # 特殊处理："该内容暂时无法查看"独立页面