fix(docker): resolve proxy pool configuration not loading in Docker deployment

Problem:
Docker starts the app with the 'uvicorn app:app' command, which skips the
if __name__ == '__main__' block, so load_dotenv() was never executed and
PROXY_URLS from .env was never loaded.

Solution:
Move load_dotenv() to module level in app.py to ensure .env is loaded for all
startup methods (python app.py, uvicorn app:app, docker-compose).

Changes:
- Add module-level load_dotenv() in app.py
- Update Dockerfile version 1.0.4 -> 1.0.5
- Improve audio content display UI
- Add docs/ and scripts/ to .gitignore

Made-with: Cursor
This commit is contained in:
tmwgsicp 2026-03-29 20:29:07 +08:00
parent 9cfa0ac5b1
commit 8d90743584
5 changed files with 57 additions and 8 deletions

4
.gitignore vendored
View File

@ -67,3 +67,7 @@ data/
# SaaS 版本(独立仓库管理) # SaaS 版本(独立仓库管理)
saas/ saas/
# 个人文档和脚本(不提交)
docs/
scripts/

View File

@ -21,7 +21,7 @@ FROM python:3.11-slim
LABEL maintainer="tmwgsicp" LABEL maintainer="tmwgsicp"
LABEL description="WeChat Official Account Article Download API with RSS Support" LABEL description="WeChat Official Account Article Download API with RSS Support"
LABEL version="1.0.4" LABEL version="1.0.5"
WORKDIR /app WORKDIR /app

3
app.py
View File

@ -10,6 +10,9 @@
""" """
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from dotenv import load_dotenv
load_dotenv()
from fastapi import FastAPI from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles

View File

@ -105,15 +105,22 @@ def extract_content(html: str) -> str:
Extract article body, trying multiple container patterns. Extract article body, trying multiple container patterns.
Different WeChat account types (government, media, personal) use Different WeChat account types (government, media, personal) use
different HTML structures. We try them in order of specificity. different HTML structures. We try them in order of specificity.
For image-text messages (item_show_type=8) and short posts (item_show_type=10), For image-text messages (item_show_type=8), short posts (item_show_type=10),
delegates to helpers. and audio share pages (item_show_type=7), delegates to helpers.
""" """
from utils.helpers import ( from utils.helpers import (
is_image_text_message, _extract_image_text_content, is_image_text_message, _extract_image_text_content,
is_short_content_message, _extract_short_content, is_short_content_message, _extract_short_content,
is_audio_message, _extract_audio_content, is_audio_message, _extract_audio_content,
get_item_show_type, _extract_audio_share_content,
) )
# Check for audio/video share pages (item_show_type=7) FIRST
# These pages use Vue apps and have no js_content div
if get_item_show_type(html) == '7':
result = _extract_audio_share_content(html)
return result.get('content', '')
if is_image_text_message(html): if is_image_text_message(html):
result = _extract_image_text_content(html) result = _extract_image_text_content(html)
return result.get('content', '') return result.get('content', '')

View File

@ -361,12 +361,14 @@ def _extract_audio_content(html: str) -> Dict:
dur_str = f' ({minutes}:{seconds:02d})' dur_str = f' ({minutes}:{seconds:02d})'
display_name = audio['name'] or f'Audio {i + 1}' display_name = audio['name'] or f'Audio {i + 1}'
# 友好提示音频需要微信鉴权不提供无法播放的URL
html_parts.append( html_parts.append(
f'<div style="margin:12px 0;padding:12px 16px;background:#f6f6f6;border-radius:8px">' f'<div style="margin:12px 0;padding:12px 16px;background:#fff9e6;'
f'<p style="margin:0 0 4px;font-size:15px;font-weight:500">' f'border-left:4px solid #fa8c16;border-radius:4px">'
f'{html_module.escape(display_name)}{dur_str}</p>' f'<p style="margin:0 0 4px;font-size:14px;color:#595959;font-weight:500">'
f'<a href="{audio["url"]}" style="color:#1890ff;font-size:14px">' f'音频内容: {html_module.escape(display_name)}{dur_str}</p>'
f'[Play Audio / Click to Listen]</a>' f'<p style="margin:0;font-size:13px;color:#8c8c8c">'
f'此文章包含音频,需要在微信中查看完整内容</p>'
f'</div>' f'</div>'
) )
@ -428,6 +430,22 @@ def _extract_audio_share_content(html: str) -> Dict:
# 生成内容 # 生成内容
content_parts = [] content_parts = []
# 标题(如果有)
if title:
content_parts.append(
f'<div style="margin:20px 0;text-align:center">'
f'<h2 style="margin:0;font-size:22px;font-weight:600;color:#262626">{title}</h2>'
f'</div>'
)
# 作者(如果有)
if author:
content_parts.append(
f'<div style="margin:12px 0;text-align:center">'
f'<p style="margin:0;font-size:14px;color:#8c8c8c">作者: {author}</p>'
f'</div>'
)
# 封面图 # 封面图
if images: if images:
for img_url in images: for img_url in images:
@ -682,6 +700,8 @@ def get_unavailable_reason(html: str) -> Optional[str]:
return None return None
# 真正的不可用标记静态HTML中的明确文字 # 真正的不可用标记静态HTML中的明确文字
# 注意微信的正常文章HTML中可能在JS代码里包含"已删除"/"违规"等字符串
# 需要确保这些关键字是在实际内容中而不是在JS字符串字面量中
markers = [ markers = [
("该内容已被发布者删除", "已被发布者删除"), ("该内容已被发布者删除", "已被发布者删除"),
("内容已删除", "已被发布者删除"), ("内容已删除", "已被发布者删除"),
@ -694,6 +714,21 @@ def get_unavailable_reason(html: str) -> Optional[str]:
] ]
for keyword, reason in markers: for keyword, reason in markers:
if keyword in html: if keyword in html:
# 额外验证如果HTML很大(>1MB) 且有真实的内容容器,
# 说明是正常文章,"已删除"/"违规"可能只是JS代码中的字符串
if len(html) > 1000000:
has_real_content = (
'id="js_content"' in html or
'class="rich_media_content' in html
)
if has_real_content:
# 进一步确认:检查关键字是否在 <body> 的前10KB可见区域
# 如果只在后面的 <script> 中出现,跳过
import re
body_match = re.search(r'<body[^>]*>(.*?)(?:<script|$)', html[:50000], re.DOTALL | re.IGNORECASE)
if body_match and keyword not in body_match.group(1):
# 关键字不在body前部可能是JS代码跳过此marker
continue
return reason return reason
# 特殊处理:"该内容暂时无法查看"独立页面 # 特殊处理:"该内容暂时无法查看"独立页面