feat: add audio extraction, type-10 posts, and comprehensive unavailability detection
Made-with: Cursor
This commit is contained in:
parent
f9ccb1b2ae
commit
94a0b78ca8
|
|
@ -35,6 +35,7 @@ venv.bak/
|
||||||
.vscode/
|
.vscode/
|
||||||
.idea/
|
.idea/
|
||||||
.cursor/
|
.cursor/
|
||||||
|
.claude/
|
||||||
*.swp
|
*.swp
|
||||||
*.swo
|
*.swo
|
||||||
*~
|
*~
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,8 @@ WEBHOOK_NOTIFICATION_INTERVAL=300
|
||||||
# RSS 订阅配置
|
# RSS 订阅配置
|
||||||
# 轮询间隔(秒),默认 3600(1 小时)
|
# 轮询间隔(秒),默认 3600(1 小时)
|
||||||
RSS_POLL_INTERVAL=3600
|
RSS_POLL_INTERVAL=3600
|
||||||
|
# 每次轮询拉取的文章批次数,默认 10(高频更新的公众号可适当增大)
|
||||||
|
ARTICLES_PER_POLL=10
|
||||||
# RSS 轮询时是否获取完整文章内容(true/false),默认 true
|
# RSS 轮询时是否获取完整文章内容(true/false),默认 true
|
||||||
# ⚠️ 启用时强烈建议配置下方的 PROXY_URLS,避免账号被微信风控
|
# ⚠️ 启用时强烈建议配置下方的 PROXY_URLS,避免账号被微信风控
|
||||||
RSS_FETCH_FULL_CONTENT=true
|
RSS_FETCH_FULL_CONTENT=true
|
||||||
|
|
|
||||||
|
|
@ -44,42 +44,83 @@ async def _fetch_via_proxy(
|
||||||
article_url: str,
|
article_url: str,
|
||||||
timeout: int,
|
timeout: int,
|
||||||
wechat_cookie: Optional[str] = None,
|
wechat_cookie: Optional[str] = None,
|
||||||
wechat_token: Optional[str] = None
|
wechat_token: Optional[str] = None,
|
||||||
|
max_retries: int = 2
|
||||||
) -> Optional[str]:
|
) -> Optional[str]:
|
||||||
"""通过 SOCKS5 代理或直连获取文章"""
|
"""
|
||||||
|
通过 SOCKS5 代理或直连获取文章
|
||||||
|
|
||||||
|
Args:
|
||||||
|
article_url: 文章 URL
|
||||||
|
timeout: 超时时间
|
||||||
|
wechat_cookie: 微信 Cookie
|
||||||
|
wechat_token: 微信 Token
|
||||||
|
max_retries: 内容验证失败时的最大重试次数(每次会尝试不同代理)
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
# 使用现有的 http_client(支持代理池轮转 + 直连兜底)
|
|
||||||
from utils.http_client import fetch_page
|
from utils.http_client import fetch_page
|
||||||
|
|
||||||
logger.info("[Proxy] %s", article_url[:80])
|
logger.info("[Fetch] %s", article_url[:80])
|
||||||
|
|
||||||
# 构建完整 URL(带 token)
|
|
||||||
full_url = article_url
|
full_url = article_url
|
||||||
if wechat_token:
|
if wechat_token:
|
||||||
separator = '&' if '?' in article_url else '?'
|
separator = '&' if '?' in article_url else '?'
|
||||||
full_url = f"{article_url}{separator}token={wechat_token}"
|
full_url = f"{article_url}{separator}token={wechat_token}"
|
||||||
|
|
||||||
# 准备请求头
|
|
||||||
extra_headers = {"Referer": "https://mp.weixin.qq.com/"}
|
extra_headers = {"Referer": "https://mp.weixin.qq.com/"}
|
||||||
if wechat_cookie:
|
if wechat_cookie:
|
||||||
extra_headers["Cookie"] = wechat_cookie
|
extra_headers["Cookie"] = wechat_cookie
|
||||||
|
|
||||||
html = await fetch_page(
|
for attempt in range(max_retries + 1):
|
||||||
full_url,
|
try:
|
||||||
extra_headers=extra_headers,
|
html = await fetch_page(
|
||||||
timeout=timeout
|
full_url,
|
||||||
)
|
extra_headers=extra_headers,
|
||||||
|
timeout=timeout
|
||||||
|
)
|
||||||
|
|
||||||
|
from utils.helpers import has_article_content, is_article_unavailable
|
||||||
|
|
||||||
|
if is_article_unavailable(html):
|
||||||
|
logger.warning("[Fetch] permanently unavailable (attempt %d/%d) %s",
|
||||||
|
attempt + 1, max_retries + 1, article_url[:60])
|
||||||
|
return html
|
||||||
|
|
||||||
|
if has_article_content(html):
|
||||||
|
logger.info("[Fetch] len=%d (attempt %d/%d)",
|
||||||
|
len(html), attempt + 1, max_retries + 1)
|
||||||
|
return html
|
||||||
|
else:
|
||||||
|
hint = "unknown"
|
||||||
|
if "验证" in html or "verify" in html.lower() or "环境异常" in html:
|
||||||
|
hint = "wechat_verification"
|
||||||
|
elif "请登录" in html or "login" in html.lower():
|
||||||
|
hint = "login_required"
|
||||||
|
elif "location.replace" in html or "location.href" in html:
|
||||||
|
hint = "redirect_page"
|
||||||
|
elif len(html) < 1000:
|
||||||
|
hint = "empty_or_blocked"
|
||||||
|
|
||||||
|
logger.warning(
|
||||||
|
"[Fetch] invalid (len=%d, hint=%s, attempt %d/%d) %s",
|
||||||
|
len(html), hint, attempt + 1, max_retries + 1,
|
||||||
|
article_url[:60]
|
||||||
|
)
|
||||||
|
if attempt < max_retries:
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
continue
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("[Fetch] request error: %s (attempt %d/%d)",
|
||||||
|
str(e)[:100], attempt + 1, max_retries + 1)
|
||||||
|
if attempt < max_retries:
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
continue
|
||||||
|
|
||||||
from utils.helpers import has_article_content
|
return None
|
||||||
if has_article_content(html):
|
|
||||||
logger.info("[Proxy] len=%d", len(html))
|
|
||||||
return html
|
|
||||||
else:
|
|
||||||
logger.warning("[Proxy] invalid content (len=%d, no known content marker)", len(html))
|
|
||||||
return None
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("[Proxy] %s", str(e)[:100])
|
logger.error("[Fetch] fatal error: %s", str(e)[:100])
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -110,9 +151,7 @@ async def fetch_articles_batch(
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
html = await fetch_article_content(url, timeout, wechat_token, wechat_cookie)
|
html = await fetch_article_content(url, timeout, wechat_token, wechat_cookie)
|
||||||
results[url] = html
|
results[url] = html
|
||||||
|
await asyncio.sleep(1)
|
||||||
# 避免请求过快
|
|
||||||
await asyncio.sleep(0.5)
|
|
||||||
|
|
||||||
logger.info("[Batch] 开始批量获取 %d 篇文章", len(article_urls))
|
logger.info("[Batch] 开始批量获取 %d 篇文章", len(article_urls))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -100,14 +100,27 @@ def extract_content(html: str) -> str:
|
||||||
Extract article body, trying multiple container patterns.
|
Extract article body, trying multiple container patterns.
|
||||||
Different WeChat account types (government, media, personal) use
|
Different WeChat account types (government, media, personal) use
|
||||||
different HTML structures. We try them in order of specificity.
|
different HTML structures. We try them in order of specificity.
|
||||||
For image-text messages (item_show_type=8), delegates to helpers.
|
For image-text messages (item_show_type=8) and short posts (item_show_type=10),
|
||||||
|
delegates to helpers.
|
||||||
"""
|
"""
|
||||||
from utils.helpers import is_image_text_message, _extract_image_text_content
|
from utils.helpers import (
|
||||||
|
is_image_text_message, _extract_image_text_content,
|
||||||
|
is_short_content_message, _extract_short_content,
|
||||||
|
is_audio_message, _extract_audio_content,
|
||||||
|
)
|
||||||
|
|
||||||
if is_image_text_message(html):
|
if is_image_text_message(html):
|
||||||
result = _extract_image_text_content(html)
|
result = _extract_image_text_content(html)
|
||||||
return result.get('content', '')
|
return result.get('content', '')
|
||||||
|
|
||||||
|
if is_short_content_message(html):
|
||||||
|
result = _extract_short_content(html)
|
||||||
|
return result.get('content', '')
|
||||||
|
|
||||||
|
if is_audio_message(html):
|
||||||
|
result = _extract_audio_content(html)
|
||||||
|
return result.get('content', '')
|
||||||
|
|
||||||
# Pattern 1: id="js_content" (most common)
|
# Pattern 1: id="js_content" (most common)
|
||||||
content = _extract_div_inner(html, r'<div[^>]*\bid=["\']js_content["\'][^>]*>')
|
content = _extract_div_inner(html, r'<div[^>]*\bid=["\']js_content["\'][^>]*>')
|
||||||
if content:
|
if content:
|
||||||
|
|
|
||||||
247
utils/helpers.py
247
utils/helpers.py
|
|
@ -62,10 +62,31 @@ def parse_article_url(url: str) -> Optional[Dict[str, str]]:
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def get_item_show_type(html: str) -> Optional[str]:
    """Return the numeric ``window.item_show_type`` value embedded in a page.

    The page script assigns the type as a quoted number, for example
    ``window.item_show_type = '8'``. Both single- and double-quoted forms
    are accepted, as long as the closing quote matches the opening one.

    Args:
        html: Raw page HTML. Empty or ``None`` input yields ``None``.

    Returns:
        The digits as a string (e.g. ``'8'`` or ``'10'``), or ``None`` when
        no such assignment is present.
    """
    if not html:
        return None
    # \1 forces the closing quote to be the same character as the opening one.
    m = re.search(r"window\.item_show_type\s*=\s*(['\"])(\d+)\1", html)
    return m.group(2) if m else None
|
||||||
|
|
||||||
|
|
||||||
def is_image_text_message(html: str) -> bool:
    """Detect image-text messages (item_show_type=8: multiple images plus text)."""
    show_type = get_item_show_type(html)
    return show_type == '8'
|
|
||||||
|
|
||||||
|
def is_short_content_message(html: str) -> bool:
    """Detect short posts / reposts (item_show_type=10: plain text, no js_content div)."""
    show_type = get_item_show_type(html)
    return show_type == '10'
|
||||||
|
|
||||||
|
|
||||||
|
def is_audio_message(html: str) -> bool:
    """Detect audio articles (voice embedded via mpvoice / mp-common-mpaudio).

    Returns True when the page text contains any of the known audio markers:
    the ``voice_encode_fileid`` attribute, an ``<mpvoice`` tag opener, the
    ``mp-common-mpaudio`` component name, or the ``js_editor_audio`` class.
    """
    audio_markers = (
        'voice_encode_fileid',
        '<mpvoice',
        'mp-common-mpaudio',
        'js_editor_audio',
    )
    return any(marker in html for marker in audio_markers)
|
||||||
|
|
||||||
|
|
||||||
def _extract_image_text_content(html: str) -> Dict:
|
def _extract_image_text_content(html: str) -> Dict:
|
||||||
|
|
@ -198,6 +219,159 @@ def _extract_image_text_content(html: str) -> Dict:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _jsdecode_unescape(s: str) -> str:
|
||||||
|
"""Unescape JsDecode \\xNN sequences and HTML entities."""
|
||||||
|
import html as html_module
|
||||||
|
s = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1), 16)), s)
|
||||||
|
s = html_module.unescape(s)
|
||||||
|
s = html_module.unescape(s)
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_short_content(html: str) -> Dict:
    """Extract content from item_show_type=10 pages (short posts / reposts).

    Type-10 pages have no js_content div; the text and metadata sit inside
    ``JsDecode('...')`` calls in <script> tags.

    Returns:
        A dict with:
        - ``content``: the text rendered as one styled ``<p>`` per non-empty line
        - ``plain_content``: the same text with newlines normalised
        - ``images``: the head image URL, when it points at a recognised
          WeChat image host; otherwise an empty list
    """
    from html import escape

    # Prefer content_noencode, fall back to content. A capture of 10
    # characters or fewer is ignored and the next key is tried.
    text = ''
    for key in ('content_noencode', 'content'):
        found = re.search(rf"{key}:\s*JsDecode\('([^']*)'\)", html)
        if found is None or len(found.group(1)) <= 10:
            continue
        text = _jsdecode_unescape(found.group(1))
        break

    # Head image, kept only when hosted on a known image domain.
    images = []
    head = re.search(r"round_head_img:\s*JsDecode\('([^']+)'\)", html)
    if head is not None:
        head_url = _jsdecode_unescape(head.group(1))
        if any(host in head_url for host in ('mmbiz.qpic.cn', 'wx.qlogo.cn')):
            images.append(head_url)

    # Turn any remaining literal "\x0a" / "\n" escapes into real newlines.
    plain_content = text.replace('\\x0a', '\n').replace('\\n', '\n')

    # One escaped paragraph per non-blank line.
    paragraphs = [
        f'<p style="margin:0 0 8px;line-height:1.8;font-size:15px;color:#333">{escape(stripped)}</p>'
        for stripped in (raw.strip() for raw in plain_content.split('\n'))
        if stripped
    ]

    return {
        'content': '\n'.join(paragraphs),
        'plain_content': plain_content,
        'images': images,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_audio_content(html: str) -> Dict:
|
||||||
|
"""
|
||||||
|
Extract audio content from WeChat voice articles.
|
||||||
|
音频文章使用 mpvoice / mp-common-mpaudio 标签嵌入语音,
|
||||||
|
通过 voice_encode_fileid 构造下载链接。
|
||||||
|
|
||||||
|
Also extracts any surrounding text content from js_content.
|
||||||
|
"""
|
||||||
|
import html as html_module
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
audio_items = []
|
||||||
|
|
||||||
|
# Pattern 1: <mpvoice voice_encode_fileid="..." name="..." .../>
|
||||||
|
for m in re.finditer(
|
||||||
|
r'<mpvoice[^>]*voice_encode_fileid=["\']([^"\']+)["\'][^>]*/?>',
|
||||||
|
html, re.IGNORECASE
|
||||||
|
):
|
||||||
|
fileid = m.group(1)
|
||||||
|
name_m = re.search(r'name=["\']([^"\']*)["\']', m.group(0))
|
||||||
|
name = html_module.unescape(name_m.group(1)) if name_m else ''
|
||||||
|
play_length_m = re.search(r'play_length=["\'](\d+)["\']', m.group(0))
|
||||||
|
duration = int(play_length_m.group(1)) if play_length_m else 0
|
||||||
|
audio_url = f"https://res.wx.qq.com/voice/getvoice?mediaid={fileid}"
|
||||||
|
audio_items.append({'name': name, 'url': audio_url, 'duration': duration})
|
||||||
|
|
||||||
|
# Pattern 2: mp-common-mpaudio with voice_encode_fileid in data or attributes
|
||||||
|
if not audio_items:
|
||||||
|
for m in re.finditer(
|
||||||
|
r'<mp-common-mpaudio[^>]*voice_encode_fileid=["\']([^"\']+)["\'][^>]*>',
|
||||||
|
html, re.IGNORECASE
|
||||||
|
):
|
||||||
|
fileid = m.group(1)
|
||||||
|
name_m = re.search(r'name=["\']([^"\']*)["\']', m.group(0))
|
||||||
|
name = html_module.unescape(name_m.group(1)) if name_m else ''
|
||||||
|
play_length_m = re.search(r'play_length=["\'](\d+)["\']', m.group(0))
|
||||||
|
duration = int(play_length_m.group(1)) if play_length_m else 0
|
||||||
|
audio_url = f"https://res.wx.qq.com/voice/getvoice?mediaid={fileid}"
|
||||||
|
audio_items.append({'name': name, 'url': audio_url, 'duration': duration})
|
||||||
|
|
||||||
|
# Build HTML content
|
||||||
|
html_parts = []
|
||||||
|
|
||||||
|
# Extract surrounding text from js_content (some audio articles have text too)
|
||||||
|
text_content = ''
|
||||||
|
js_match = re.search(
|
||||||
|
r'<div[^>]*id=["\']js_content["\'][^>]*>([\s\S]*?)</div>\s*(?:<script|<div[^>]*class=["\']rich_media_tool)',
|
||||||
|
html, re.IGNORECASE
|
||||||
|
)
|
||||||
|
if js_match:
|
||||||
|
try:
|
||||||
|
soup = BeautifulSoup(js_match.group(1), 'html.parser')
|
||||||
|
for tag in soup.find_all(['mpvoice', 'mp-common-mpaudio']):
|
||||||
|
tag.decompose()
|
||||||
|
text_content = soup.get_text(separator='\n', strip=True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if text_content:
|
||||||
|
for line in text_content.split('\n'):
|
||||||
|
line = line.strip()
|
||||||
|
if line:
|
||||||
|
html_parts.append(f'<p style="margin:0 0 8px;line-height:1.8">{html_module.escape(line)}</p>')
|
||||||
|
|
||||||
|
for i, audio in enumerate(audio_items):
|
||||||
|
dur_str = ''
|
||||||
|
if audio['duration'] > 0:
|
||||||
|
minutes = audio['duration'] // 60
|
||||||
|
seconds = audio['duration'] % 60
|
||||||
|
dur_str = f' ({minutes}:{seconds:02d})'
|
||||||
|
|
||||||
|
display_name = audio['name'] or f'Audio {i + 1}'
|
||||||
|
html_parts.append(
|
||||||
|
f'<div style="margin:12px 0;padding:12px 16px;background:#f6f6f6;border-radius:8px">'
|
||||||
|
f'<p style="margin:0 0 4px;font-size:15px;font-weight:500">'
|
||||||
|
f'{html_module.escape(display_name)}{dur_str}</p>'
|
||||||
|
f'<a href="{audio["url"]}" style="color:#1890ff;font-size:14px">'
|
||||||
|
f'[Play Audio / Click to Listen]</a>'
|
||||||
|
f'</div>'
|
||||||
|
)
|
||||||
|
|
||||||
|
content = '\n'.join(html_parts) if html_parts else ''
|
||||||
|
|
||||||
|
plain_parts = []
|
||||||
|
if text_content:
|
||||||
|
plain_parts.append(text_content)
|
||||||
|
for i, audio in enumerate(audio_items):
|
||||||
|
display_name = audio['name'] or f'Audio {i + 1}'
|
||||||
|
plain_parts.append(f"[Audio] {display_name} - {audio['url']}")
|
||||||
|
|
||||||
|
return {
|
||||||
|
'content': content,
|
||||||
|
'plain_content': '\n'.join(plain_parts),
|
||||||
|
'images': [],
|
||||||
|
'audios': audio_items,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
|
def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
|
||||||
"""
|
"""
|
||||||
从HTML中提取文章信息
|
从HTML中提取文章信息
|
||||||
|
|
@ -217,11 +391,13 @@ def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
|
||||||
re.search(r'<h2[^>]*class=[^>]*rich_media_title[^>]*>([\s\S]*?)</h2>', html, re.IGNORECASE) or
|
re.search(r'<h2[^>]*class=[^>]*rich_media_title[^>]*>([\s\S]*?)</h2>', html, re.IGNORECASE) or
|
||||||
re.search(r"var\s+msg_title\s*=\s*'([^']+)'\.html\(false\)", html) or
|
re.search(r"var\s+msg_title\s*=\s*'([^']+)'\.html\(false\)", html) or
|
||||||
re.search(r"window\.msg_title\s*=\s*window\.title\s*=\s*'([^']*)'", html) or
|
re.search(r"window\.msg_title\s*=\s*window\.title\s*=\s*'([^']*)'", html) or
|
||||||
re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', html)
|
re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', html) or
|
||||||
|
re.search(r"msg_title:\s*JsDecode\('([^']+)'\)", html)
|
||||||
)
|
)
|
||||||
|
|
||||||
if title_match:
|
if title_match:
|
||||||
title = title_match.group(1)
|
title = title_match.group(1)
|
||||||
|
title = _jsdecode_unescape(title)
|
||||||
title = re.sub(r'<[^>]+>', '', title)
|
title = re.sub(r'<[^>]+>', '', title)
|
||||||
title = title.replace('"', '"').replace('&', '&').strip()
|
title = title.replace('"', '"').replace('&', '&').strip()
|
||||||
|
|
||||||
|
|
@ -251,12 +427,22 @@ def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# 检测是否为图文消息(item_show_type=8)
|
# 检测特殊内容类型
|
||||||
if is_image_text_message(html):
|
if is_image_text_message(html):
|
||||||
img_text_data = _extract_image_text_content(html)
|
img_text_data = _extract_image_text_content(html)
|
||||||
content = img_text_data['content']
|
content = img_text_data['content']
|
||||||
images = img_text_data['images']
|
images = img_text_data['images']
|
||||||
plain_content = img_text_data['plain_content']
|
plain_content = img_text_data['plain_content']
|
||||||
|
elif is_short_content_message(html):
|
||||||
|
short_data = _extract_short_content(html)
|
||||||
|
content = short_data['content']
|
||||||
|
images = short_data['images']
|
||||||
|
plain_content = short_data['plain_content']
|
||||||
|
elif is_audio_message(html):
|
||||||
|
audio_data = _extract_audio_content(html)
|
||||||
|
content = audio_data['content']
|
||||||
|
images = audio_data['images']
|
||||||
|
plain_content = audio_data['plain_content']
|
||||||
else:
|
else:
|
||||||
content = ''
|
content = ''
|
||||||
images = []
|
images = []
|
||||||
|
|
@ -318,17 +504,23 @@ def has_article_content(html: str) -> bool:
|
||||||
"""
|
"""
|
||||||
Check whether the fetched HTML likely contains article content.
|
Check whether the fetched HTML likely contains article content.
|
||||||
Different WeChat account types use different content containers.
|
Different WeChat account types use different content containers.
|
||||||
|
|
||||||
|
Must match actual HTML elements (id/class attributes), not random JS strings,
|
||||||
|
to avoid false positives on WeChat verification pages (~1.9MB) that contain
|
||||||
|
"js_content" references in their JavaScript code.
|
||||||
"""
|
"""
|
||||||
content_markers = [
|
element_markers = [
|
||||||
"js_content",
|
'id="js_content"',
|
||||||
"rich_media_content",
|
'class="rich_media_content',
|
||||||
"rich_media_area_primary",
|
'class="rich_media_area_primary',
|
||||||
"page-content",
|
'id="page-content"',
|
||||||
"page_content",
|
'id="page_content"',
|
||||||
]
|
]
|
||||||
if any(marker in html for marker in content_markers):
|
if any(marker in html for marker in element_markers):
|
||||||
return True
|
return True
|
||||||
return is_image_text_message(html)
|
if is_image_text_message(html) or is_short_content_message(html) or is_audio_message(html):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def get_client_ip(request) -> str:
|
def get_client_ip(request) -> str:
|
||||||
|
|
@ -349,6 +541,37 @@ def is_article_deleted(html: str) -> bool:
|
||||||
"""检查文章是否被删除"""
|
"""检查文章是否被删除"""
|
||||||
return '已删除' in html or 'deleted' in html.lower()
|
return '已删除' in html or 'deleted' in html.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def is_article_unavailable(html: str) -> bool:
    """Tell whether the page is permanently unavailable (deleted / censored / restricted).

    True exactly when ``get_unavailable_reason`` finds a reason for the page.
    """
    reason = get_unavailable_reason(html)
    return reason is not None
|
||||||
|
|
||||||
|
|
||||||
|
def get_unavailable_reason(html: str) -> Optional[str]:
    """Return a human-readable reason when the page is permanently unavailable.

    The page text is checked against a fixed list of notice phrases, in
    order; the reason paired with the first phrase found is returned.
    ``None`` means no phrase matched.
    """
    # (phrase looked for in the page, reason reported to the caller)
    reason_by_keyword = (
        ("该内容已被发布者删除", "已被发布者删除"),
        ("内容已删除", "已被发布者删除"),
        ("此内容因违规无法查看", "因违规无法查看"),
        ("涉嫌违反相关法律法规和政策", "涉嫌违规被限制"),
        ("此内容发送失败无法查看", "发送失败无法查看"),
        ("该内容暂时无法查看", "暂时无法查看"),
        ("根据作者隐私设置,无法查看该内容", "作者隐私设置不可见"),
        ("接相关投诉,此内容违反", "因投诉违规被限制"),
        ("该文章已被第三方辟谣", "已被第三方辟谣"),
    )
    return next(
        (reason for keyword, reason in reason_by_keyword if keyword in html),
        None,
    )
|
||||||
|
|
||||||
|
|
||||||
def is_need_verification(html: str) -> bool:
|
def is_need_verification(html: str) -> bool:
|
||||||
"""检查是否需要验证"""
|
"""检查是否需要验证"""
|
||||||
return ('verify' in html.lower() or
|
return ('verify' in html.lower() or
|
||||||
|
|
|
||||||
|
|
@ -20,13 +20,13 @@ import httpx
|
||||||
|
|
||||||
from utils.auth_manager import auth_manager
|
from utils.auth_manager import auth_manager
|
||||||
from utils import rss_store
|
from utils import rss_store
|
||||||
from utils.helpers import extract_article_info, parse_article_url, is_image_text_message, has_article_content
|
from utils.helpers import extract_article_info, parse_article_url, is_image_text_message, has_article_content, is_article_unavailable, get_unavailable_reason
|
||||||
from utils.http_client import fetch_page
|
from utils.http_client import fetch_page
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
POLL_INTERVAL = int(os.getenv("RSS_POLL_INTERVAL", "3600"))
|
POLL_INTERVAL = int(os.getenv("RSS_POLL_INTERVAL", "3600"))
|
||||||
ARTICLES_PER_POLL = 10
|
ARTICLES_PER_POLL = int(os.getenv("ARTICLES_PER_POLL", "10"))
|
||||||
FETCH_FULL_CONTENT = os.getenv("RSS_FETCH_FULL_CONTENT", "true").lower() == "true"
|
FETCH_FULL_CONTENT = os.getenv("RSS_FETCH_FULL_CONTENT", "true").lower() == "true"
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -207,10 +207,9 @@ class RSSPoller:
|
||||||
wechat_token = os.getenv("WECHAT_TOKEN", "")
|
wechat_token = os.getenv("WECHAT_TOKEN", "")
|
||||||
wechat_cookie = os.getenv("WECHAT_COOKIE", "")
|
wechat_cookie = os.getenv("WECHAT_COOKIE", "")
|
||||||
|
|
||||||
# 批量并发获取(max_concurrency=5,传递微信凭证)
|
|
||||||
results = await fetch_articles_batch(
|
results = await fetch_articles_batch(
|
||||||
article_links,
|
article_links,
|
||||||
max_concurrency=5,
|
max_concurrency=3,
|
||||||
timeout=60,
|
timeout=60,
|
||||||
wechat_token=wechat_token,
|
wechat_token=wechat_token,
|
||||||
wechat_cookie=wechat_cookie
|
wechat_cookie=wechat_cookie
|
||||||
|
|
@ -225,7 +224,18 @@ class RSSPoller:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
html = results.get(link)
|
html = results.get(link)
|
||||||
if not html or not has_article_content(html):
|
if not html:
|
||||||
|
logger.warning("Empty HTML: %s", link[:80])
|
||||||
|
enriched.append(article)
|
||||||
|
continue
|
||||||
|
if is_article_unavailable(html):
|
||||||
|
reason = get_unavailable_reason(html) or "unknown"
|
||||||
|
logger.warning("Article permanently unavailable (%s): %s", reason, link[:80])
|
||||||
|
article["content"] = f"<p>[unavailable] {reason}</p>"
|
||||||
|
article["plain_content"] = f"[unavailable] {reason}"
|
||||||
|
enriched.append(article)
|
||||||
|
continue
|
||||||
|
if not has_article_content(html):
|
||||||
logger.warning("No content in HTML: %s", link[:80])
|
logger.warning("No content in HTML: %s", link[:80])
|
||||||
enriched.append(article)
|
enriched.append(article)
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,7 @@ EVENT_LABELS = {
|
||||||
"login_success": "登录成功",
|
"login_success": "登录成功",
|
||||||
"login_expired": "登录过期",
|
"login_expired": "登录过期",
|
||||||
"verification_required": "触发验证",
|
"verification_required": "触发验证",
|
||||||
|
"content_fetch_failed": "文章内容获取失败",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue