From 8d907435846df03cf60d70950718368f62edeb32 Mon Sep 17 00:00:00 2001 From: tmwgsicp <2589462900@qq.com> Date: Sun, 29 Mar 2026 20:29:07 +0800 Subject: [PATCH] fix(docker): resolve proxy pool configuration not loading in Docker deployment Problem: Docker uses the 'uvicorn app:app' command, which skips the if __name__ == '__main__' block, so load_dotenv() is never executed and PROXY_URLS from .env is never loaded. Solution: Move load_dotenv() to module level in app.py to ensure .env is loaded for all startup methods (python app.py, uvicorn app:app, docker-compose). Changes: - Add module-level load_dotenv() in app.py - Update Dockerfile version 1.0.4 -> 1.0.5 - Improve audio content display UI - Add docs/ and scripts/ to .gitignore Made-with: Cursor --- .gitignore | 4 ++++ Dockerfile | 2 +- app.py | 3 +++ utils/content_processor.py | 11 ++++++++-- utils/helpers.py | 45 +++++++++++++++++++++++++++++++++----- 5 files changed, 57 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 11f4934..1579281 100644 --- a/.gitignore +++ b/.gitignore @@ -67,3 +67,7 @@ data/ # SaaS 版本(独立仓库管理) saas/ + +# 个人文档和脚本(不提交) +docs/ +scripts/ diff --git a/Dockerfile b/Dockerfile index 99b1c32..39bf790 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,7 +21,7 @@ FROM python:3.11-slim LABEL maintainer="tmwgsicp" LABEL description="WeChat Official Account Article Download API with RSS Support" -LABEL version="1.0.4" +LABEL version="1.0.5" WORKDIR /app diff --git a/app.py b/app.py index 383e14e..c8d77cc 100644 --- a/app.py +++ b/app.py @@ -10,6 +10,9 @@ """ from contextlib import asynccontextmanager +from dotenv import load_dotenv + +load_dotenv() from fastapi import FastAPI from fastapi.staticfiles import StaticFiles diff --git a/utils/content_processor.py b/utils/content_processor.py index b888b2c..259802a 100644 --- a/utils/content_processor.py +++ b/utils/content_processor.py @@ -105,15 +105,22 @@ def extract_content(html: str) -> str: Extract article body, trying multiple 
container patterns. Different WeChat account types (government, media, personal) use different HTML structures. We try them in order of specificity. - For image-text messages (item_show_type=8) and short posts (item_show_type=10), - delegates to helpers. + For image-text messages (item_show_type=8), short posts (item_show_type=10), + and audio share pages (item_show_type=7), delegates to helpers. """ from utils.helpers import ( is_image_text_message, _extract_image_text_content, is_short_content_message, _extract_short_content, is_audio_message, _extract_audio_content, + get_item_show_type, _extract_audio_share_content, ) + # Check for audio/video share pages (item_show_type=7) FIRST + # These pages use Vue apps and have no js_content div + if get_item_show_type(html) == '7': + result = _extract_audio_share_content(html) + return result.get('content', '') + if is_image_text_message(html): result = _extract_image_text_content(html) return result.get('content', '') diff --git a/utils/helpers.py b/utils/helpers.py index fbdd70b..141519b 100644 --- a/utils/helpers.py +++ b/utils/helpers.py @@ -361,12 +361,14 @@ def _extract_audio_content(html: str) -> Dict: dur_str = f' ({minutes}:{seconds:02d})' display_name = audio['name'] or f'Audio {i + 1}' + # 友好提示:音频需要微信鉴权,不提供无法播放的URL html_parts.append( - f'
' - f'

' - f'{html_module.escape(display_name)}{dur_str}

' - f'' - f'[Play Audio / Click to Listen]' + f'
' + f'

' + f'音频内容: {html_module.escape(display_name)}{dur_str}

' + f'

' + f'此文章包含音频,需要在微信中查看完整内容

' f'
' ) @@ -428,6 +430,22 @@ def _extract_audio_share_content(html: str) -> Dict: # 生成内容 content_parts = [] + # 标题(如果有) + if title: + content_parts.append( + f'
' + f'

{title}

' + f'
' + ) + + # 作者(如果有) + if author: + content_parts.append( + f'
' + f'

作者: {author}

' + f'
' + ) + # 封面图 if images: for img_url in images: @@ -682,6 +700,8 @@ def get_unavailable_reason(html: str) -> Optional[str]: return None # 真正的不可用标记(静态HTML中的明确文字) + # 注意:微信的正常文章HTML中可能在JS代码里包含"已删除"/"违规"等字符串 + # 需要确保这些关键字是在实际内容中,而不是在JS字符串字面量中 markers = [ ("该内容已被发布者删除", "已被发布者删除"), ("内容已删除", "已被发布者删除"), @@ -694,6 +714,21 @@ def get_unavailable_reason(html: str) -> Optional[str]: ] for keyword, reason in markers: if keyword in html: + # 额外验证:如果HTML很大(>1MB) 且有真实的内容容器, + # 说明是正常文章,"已删除"/"违规"可能只是JS代码中的字符串 + if len(html) > 1000000: + has_real_content = ( + 'id="js_content"' in html or + 'class="rich_media_content' in html + ) + if has_real_content: + # 进一步确认:检查关键字是否在 的前10KB可见区域 + # 如果只在后面的