fix: support image-text messages and multi-container article extraction
This commit is contained in:
parent
4825edc355
commit
f9ccb1b2ae
|
|
@ -8,16 +8,21 @@
|
|||
文章路由 - FastAPI版本
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional, List
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Optional, List
|
||||
import re
|
||||
|
||||
from utils.auth_manager import auth_manager
|
||||
from utils.helpers import extract_article_info, parse_article_url
|
||||
from utils.helpers import extract_article_info, parse_article_url, is_image_text_message, has_article_content, get_client_ip
|
||||
from utils.rate_limiter import rate_limiter
|
||||
from utils.webhook import webhook
|
||||
from utils.http_client import fetch_page
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
class ArticleRequest(BaseModel):
|
||||
|
|
@ -56,23 +61,25 @@ async def get_article(article_request: ArticleRequest, request: Request):
|
|||
- `publish_time`: 发布时间戳
|
||||
- `images`: 文章内的图片列表
|
||||
"""
|
||||
client_ip = request.client.host if request.client else "unknown"
|
||||
client_ip = get_client_ip(request)
|
||||
allowed, error_msg = rate_limiter.check_rate_limit(client_ip, "/api/article")
|
||||
if not allowed:
|
||||
return {"success": False, "error": f"⏱️ {error_msg}"}
|
||||
return {"success": False, "error": f"Rate limited: {error_msg}"}
|
||||
|
||||
credentials = auth_manager.get_credentials()
|
||||
if not credentials:
|
||||
return {"success": False, "error": "服务器未登录,请先访问管理页面扫码登录"}
|
||||
|
||||
try:
|
||||
logger.info("[Article] request from %s: %s", client_ip, article_request.url[:80])
|
||||
|
||||
html = await fetch_page(
|
||||
article_request.url,
|
||||
extra_headers={"Referer": "https://mp.weixin.qq.com/"},
|
||||
timeout=120 # WeChat 大文章可能超时,延长至 120 秒
|
||||
timeout=120
|
||||
)
|
||||
|
||||
if "js_content" not in html:
|
||||
if not has_article_content(html):
|
||||
if "verify" in html or "验证" in html or "环境异常" in html:
|
||||
await webhook.notify('verification_required', {
|
||||
'url': article_request.url,
|
||||
|
|
|
|||
|
|
@ -70,12 +70,12 @@ async def _fetch_via_proxy(
|
|||
timeout=timeout
|
||||
)
|
||||
|
||||
# 验证内容有效性: 只检查 js_content 是否存在
|
||||
if "js_content" in html:
|
||||
from utils.helpers import has_article_content
|
||||
if has_article_content(html):
|
||||
logger.info("[Proxy] len=%d", len(html))
|
||||
return html
|
||||
else:
|
||||
logger.warning("[Proxy] invalid content (len=%d, has_js_content=False)", len(html))
|
||||
logger.warning("[Proxy] invalid content (len=%d, no known content marker)", len(html))
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@ -97,17 +97,43 @@ def _extract_div_inner(html: str, open_tag_pattern: str) -> str:
|
|||
|
||||
def extract_content(html: str) -> str:
    """
    Extract the article body HTML, probing several known container elements.

    Image-text messages (item_show_type=8) are handed to the helpers module.
    Otherwise each container pattern below is tried in order and the first
    non-empty inner HTML wins. Returns an empty string (and logs a warning)
    when no container yields content.
    """
    from utils.helpers import is_image_text_message, _extract_image_text_content

    if is_image_text_message(html):
        return _extract_image_text_content(html).get('content', '')

    # Ordered from the most common container to the least common one.
    container_patterns = (
        # id="js_content" (most common)
        r'<div[^>]*\bid=["\']js_content["\'][^>]*>',
        # class contains rich_media_content
        r'<div[^>]*\bclass=["\'][^"\']*rich_media_content[^"\']*["\'][^>]*>',
        # id="page-content" (government/institutional accounts)
        r'<div[^>]*\bid=["\']page-content["\'][^>]*>',
        # class contains rich_media_area_primary_inner
        r'<div[^>]*\bclass=["\'][^"\']*rich_media_area_primary_inner[^"\']*["\'][^>]*>',
        # id="js_article" (alternative article container)
        r'<div[^>]*\bid=["\']js_article["\'][^>]*>',
    )
    for open_tag_pattern in container_patterns:
        body = _extract_div_inner(html, open_tag_pattern)
        if body:
            return body

    logger.warning("Failed to extract article body from any known container")
    return ""
|
||||
|
||||
|
||||
|
|
|
|||
276
utils/helpers.py
276
utils/helpers.py
|
|
@ -62,31 +62,169 @@ def parse_article_url(url: str) -> Optional[Dict[str, str]]:
|
|||
except Exception:
|
||||
return None
|
||||
|
||||
def is_image_text_message(html: str) -> bool:
    """Return True when the page declares ``window.item_show_type = '8'``.

    Type 8 marks an image-text message (a multi-image post with a caption).
    Only a single-quoted, all-digit value is recognised.
    """
    show_type = re.search(r"window\.item_show_type\s*=\s*'(\d+)'", html)
    if show_type is None:
        return False
    return show_type.group(1) == '8'
|
||||
|
||||
|
||||
def _extract_image_text_content(html: str) -> Dict:
|
||||
"""
|
||||
提取图文消息的内容(item_show_type=8)
|
||||
|
||||
图文消息的结构与普通文章完全不同:
|
||||
- 图片在 picture_page_info_list 的 JsDecode() 中
|
||||
- 文字在 meta description 或 content_desc 中
|
||||
- 没有 #js_content div
|
||||
"""
|
||||
import html as html_module
|
||||
|
||||
# 提取图片 URL(从 picture_page_info_list 中的 cdn_url)
|
||||
# 页面中有两种格式:
|
||||
# 1. picture_page_info_list: [ { cdn_url: JsDecode('...'), ... } ] (带JsDecode)
|
||||
# 2. picture_page_info_list = [ { width:..., height:..., cdn_url: '...' } ] (简单格式)
|
||||
# 每个 item 中第一个 cdn_url 是主图,watermark_info 内的是水印,需要跳过
|
||||
images = []
|
||||
|
||||
# 优先使用简单格式(第二种),更易解析且包含所有图片
|
||||
simple_list_pos = html.find('picture_page_info_list = [')
|
||||
if simple_list_pos >= 0:
|
||||
bracket_start = html.find('[', simple_list_pos)
|
||||
depth = 0
|
||||
end = bracket_start
|
||||
for end in range(bracket_start, min(bracket_start + 20000, len(html))):
|
||||
if html[end] == '[':
|
||||
depth += 1
|
||||
elif html[end] == ']':
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
break
|
||||
block = html[bracket_start:end + 1]
|
||||
# 按顶层 { 分割,每个 item 取第一个 cdn_url(主图)
|
||||
items = re.split(r'\n\s{4,10}\{', block)
|
||||
for item in items:
|
||||
m = re.search(r"cdn_url:\s*'([^']+)'", item)
|
||||
if m:
|
||||
url = m.group(1)
|
||||
if url not in images and ('mmbiz.qpic.cn' in url or 'mmbiz.qlogo.cn' in url):
|
||||
images.append(url)
|
||||
|
||||
# 降级: 使用 JsDecode 格式
|
||||
if not images:
|
||||
jsdecode_list_match = re.search(
|
||||
r'picture_page_info_list:\s*\[', html
|
||||
)
|
||||
if jsdecode_list_match:
|
||||
block_start = jsdecode_list_match.end() - 1
|
||||
depth = 0
|
||||
end = block_start
|
||||
for end in range(block_start, min(block_start + 20000, len(html))):
|
||||
if html[end] == '[':
|
||||
depth += 1
|
||||
elif html[end] == ']':
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
break
|
||||
block = html[block_start:end + 1]
|
||||
# 按顶层 { 分割
|
||||
items = re.split(r'\n\s{10,30}\{(?=\s*\n\s*cdn_url)', block)
|
||||
for item in items:
|
||||
m = re.search(r"cdn_url:\s*JsDecode\('([^']+)'\)", item)
|
||||
if m:
|
||||
url = m.group(1).replace('\\x26amp;', '&').replace('\\x26', '&')
|
||||
if url not in images and ('mmbiz.qpic.cn' in url or 'mmbiz.qlogo.cn' in url):
|
||||
images.append(url)
|
||||
|
||||
# 提取文字描述
|
||||
desc = ''
|
||||
# 方法1: meta description
|
||||
desc_match = re.search(r'<meta\s+name="description"\s+content="([^"]*)"', html)
|
||||
if desc_match:
|
||||
desc = desc_match.group(1)
|
||||
# 处理 \x26 编码(微信的双重编码:\x26lt; -> < -> <)
|
||||
desc = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1), 16)), desc)
|
||||
desc = html_module.unescape(desc)
|
||||
# 二次 unescape 处理双重编码
|
||||
desc = html_module.unescape(desc)
|
||||
# 清理 HTML 标签残留
|
||||
desc = re.sub(r'<[^>]+>', '', desc)
|
||||
desc = desc.replace('\\x0a', '\n').replace('\\n', '\n')
|
||||
|
||||
# 方法2: content_desc
|
||||
if not desc:
|
||||
desc_match2 = re.search(r"content_desc:\s*JsDecode\('([^']*)'\)", html)
|
||||
if desc_match2:
|
||||
desc = desc_match2.group(1)
|
||||
desc = html_module.unescape(desc)
|
||||
|
||||
# 构建 HTML 内容:竖向画廊 + 文字(RSS 兼容)
|
||||
html_parts = []
|
||||
|
||||
# 竖向画廊:每张图限宽,紧凑排列,兼容主流 RSS 阅读器
|
||||
if images:
|
||||
gallery_imgs = []
|
||||
for i, img_url in enumerate(images):
|
||||
gallery_imgs.append(
|
||||
f'<p style="text-align:center;margin:0 0 6px">'
|
||||
f'<img src="{img_url}" data-src="{img_url}" '
|
||||
f'style="max-width:480px;width:100%;height:auto;border-radius:4px" />'
|
||||
f'</p>'
|
||||
)
|
||||
gallery_imgs.append(
|
||||
f'<p style="text-align:center;color:#999;font-size:12px;margin:4px 0 0">'
|
||||
f'{len(images)} images'
|
||||
f'</p>'
|
||||
)
|
||||
html_parts.append('\n'.join(gallery_imgs))
|
||||
|
||||
# 文字描述区域
|
||||
if desc:
|
||||
text_lines = []
|
||||
for line in desc.split('\n'):
|
||||
line = line.strip()
|
||||
if line:
|
||||
text_lines.append(
|
||||
f'<p style="margin:0 0 8px;line-height:1.8;font-size:15px;color:#333">{line}</p>'
|
||||
)
|
||||
html_parts.append('\n'.join(text_lines))
|
||||
|
||||
content = '\n'.join(html_parts)
|
||||
plain_content = desc if desc else ''
|
||||
|
||||
return {
|
||||
'content': content,
|
||||
'plain_content': plain_content,
|
||||
'images': images,
|
||||
}
|
||||
|
||||
|
||||
def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
|
||||
"""
|
||||
从HTML中提取文章信息
|
||||
|
||||
|
||||
Args:
|
||||
html: 文章HTML内容
|
||||
params: URL参数(可选,用于返回__biz等信息)
|
||||
|
||||
|
||||
Returns:
|
||||
文章信息字典
|
||||
"""
|
||||
|
||||
|
||||
title = ''
|
||||
# 图文消息的标题通常在 window.msg_title 中
|
||||
title_match = (
|
||||
re.search(r'<h1[^>]*class=[^>]*rich_media_title[^>]*>([\s\S]*?)</h1>', html, re.IGNORECASE) or
|
||||
re.search(r'<h2[^>]*class=[^>]*rich_media_title[^>]*>([\s\S]*?)</h2>', html, re.IGNORECASE) or
|
||||
re.search(r"var\s+msg_title\s*=\s*'([^']+)'\.html\(false\)", html) or
|
||||
re.search(r"window\.msg_title\s*=\s*window\.title\s*=\s*'([^']*)'", html) or
|
||||
re.search(r'<meta\s+property="og:title"\s+content="([^"]+)"', html)
|
||||
)
|
||||
|
||||
|
||||
if title_match:
|
||||
title = title_match.group(1)
|
||||
title = re.sub(r'<[^>]+>', '', title)
|
||||
title = title.replace('"', '"').replace('&', '&').strip()
|
||||
|
||||
|
||||
author = ''
|
||||
author_match = (
|
||||
re.search(r'<a[^>]*id="js_name"[^>]*>([\s\S]*?)</a>', html, re.IGNORECASE) or
|
||||
|
|
@ -94,72 +232,81 @@ def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
|
|||
re.search(r'<meta\s+property="og:article:author"\s+content="([^"]+)"', html) or
|
||||
re.search(r'<a[^>]*class=[^>]*rich_media_meta_nickname[^>]*>([^<]+)</a>', html, re.IGNORECASE)
|
||||
)
|
||||
|
||||
|
||||
if author_match:
|
||||
author = author_match.group(1)
|
||||
author = re.sub(r'<[^>]+>', '', author).strip()
|
||||
|
||||
|
||||
publish_time = 0
|
||||
time_match = (
|
||||
re.search(r'var\s+publish_time\s*=\s*"(\d+)"', html) or
|
||||
re.search(r'var\s+ct\s*=\s*"(\d+)"', html) or
|
||||
re.search(r"var\s+ct\s*=\s*'(\d+)'", html) or
|
||||
re.search(r'<em[^>]*id="publish_time"[^>]*>([^<]+)</em>', html)
|
||||
)
|
||||
|
||||
|
||||
if time_match:
|
||||
try:
|
||||
publish_time = int(time_match.group(1))
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
content = ''
|
||||
images = []
|
||||
|
||||
# 方法1: 匹配 id="js_content"
|
||||
content_match = re.search(r'<div[^>]*id="js_content"[^>]*>([\s\S]*?)<script[^>]*>[\s\S]*?</script>', html, re.IGNORECASE)
|
||||
|
||||
if not content_match:
|
||||
# 方法2: 匹配 class包含rich_media_content
|
||||
content_match = re.search(r'<div[^>]*class="[^"]*rich_media_content[^"]*"[^>]*>([\s\S]*?)</div>', html, re.IGNORECASE)
|
||||
|
||||
if content_match and content_match.group(1):
|
||||
content = content_match.group(1).strip()
|
||||
|
||||
# 检测是否为图文消息(item_show_type=8)
|
||||
if is_image_text_message(html):
|
||||
img_text_data = _extract_image_text_content(html)
|
||||
content = img_text_data['content']
|
||||
images = img_text_data['images']
|
||||
plain_content = img_text_data['plain_content']
|
||||
else:
|
||||
# 方法3: 手动截取
|
||||
js_content_pos = html.find('id="js_content"')
|
||||
if js_content_pos > 0:
|
||||
start = html.find('>', js_content_pos) + 1
|
||||
script_pos = html.find('<script', start)
|
||||
if script_pos > start:
|
||||
content = html[start:script_pos].strip()
|
||||
if content:
|
||||
# 提取data-src属性
|
||||
img_regex = re.compile(r'<img[^>]+data-src="([^"]+)"')
|
||||
for img_match in img_regex.finditer(content):
|
||||
img_url = img_match.group(1)
|
||||
if img_url not in images:
|
||||
images.append(img_url)
|
||||
|
||||
# 提取src属性
|
||||
img_regex2 = re.compile(r'<img[^>]+src="([^"]+)"')
|
||||
for img_match in img_regex2.finditer(content):
|
||||
img_url = img_match.group(1)
|
||||
if not img_url.startswith('data:') and img_url not in images:
|
||||
images.append(img_url)
|
||||
|
||||
content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', content, flags=re.IGNORECASE)
|
||||
|
||||
content = ''
|
||||
images = []
|
||||
|
||||
# 方法1: 匹配 id="js_content"
|
||||
content_match = re.search(r'<div[^>]*id="js_content"[^>]*>([\s\S]*?)<script[^>]*>[\s\S]*?</script>', html, re.IGNORECASE)
|
||||
|
||||
if not content_match:
|
||||
# 方法2: 匹配 class包含rich_media_content
|
||||
content_match = re.search(r'<div[^>]*class="[^"]*rich_media_content[^"]*"[^>]*>([\s\S]*?)</div>', html, re.IGNORECASE)
|
||||
|
||||
if content_match and content_match.group(1):
|
||||
content = content_match.group(1).strip()
|
||||
else:
|
||||
# 方法3: 手动截取
|
||||
js_content_pos = html.find('id="js_content"')
|
||||
if js_content_pos > 0:
|
||||
start = html.find('>', js_content_pos) + 1
|
||||
script_pos = html.find('<script', start)
|
||||
if script_pos > start:
|
||||
content = html[start:script_pos].strip()
|
||||
if content:
|
||||
# 提取data-src属性
|
||||
img_regex = re.compile(r'<img[^>]+data-src="([^"]+)"')
|
||||
for img_match in img_regex.finditer(content):
|
||||
img_url = img_match.group(1)
|
||||
if img_url not in images:
|
||||
images.append(img_url)
|
||||
|
||||
# 提取src属性
|
||||
img_regex2 = re.compile(r'<img[^>]+src="([^"]+)"')
|
||||
for img_match in img_regex2.finditer(content):
|
||||
img_url = img_match.group(1)
|
||||
if not img_url.startswith('data:') and img_url not in images:
|
||||
images.append(img_url)
|
||||
|
||||
content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', content, flags=re.IGNORECASE)
|
||||
plain_content = html_to_text(content) if content else ''
|
||||
|
||||
__biz = params.get('__biz', 'unknown') if params else 'unknown'
|
||||
publish_time_str = ''
|
||||
if publish_time > 0:
|
||||
from datetime import datetime
|
||||
dt = datetime.fromtimestamp(publish_time)
|
||||
publish_time_str = dt.strftime('%Y-%m-%d %H:%M:%S')
|
||||
|
||||
|
||||
return {
|
||||
'title': title,
|
||||
'content': content,
|
||||
'plain_content': html_to_text(content) if content else '',
|
||||
'plain_content': plain_content,
|
||||
'images': images,
|
||||
'author': author,
|
||||
'publish_time': publish_time,
|
||||
|
|
@ -167,14 +314,45 @@ def extract_article_info(html: str, params: Optional[Dict] = None) -> Dict:
|
|||
'__biz': __biz
|
||||
}
|
||||
|
||||
def has_article_content(html: str) -> bool:
    """
    Tell whether fetched HTML looks like it carries an article body.

    The page qualifies if it mentions any known content-container marker
    (different account types use different containers) or, failing that,
    if it is an image-text message.
    """
    known_markers = (
        "js_content",
        "rich_media_content",
        "rich_media_area_primary",
        "page-content",
        "page_content",
    )
    for marker in known_markers:
        if marker in html:
            return True
    return is_image_text_message(html)
|
||||
|
||||
|
||||
def get_client_ip(request) -> str:
    """
    Extract the client IP from a request, honouring reverse-proxy headers.

    Priority: X-Forwarded-For (first non-empty entry) > X-Real-IP >
    request.client.host. Blank header values are ignored so a malformed
    header never yields an empty IP.

    NOTE(review): both headers are supplied by the caller unless a trusted
    proxy overwrites them, so the result is only as trustworthy as the
    deployment in front of this service. Confirm before relying on it for
    rate limiting.

    Args:
        request: Object exposing ``headers.get(name, default)`` and an
            optional ``client`` with a ``host`` attribute.

    Returns:
        The resolved IP string, or "unknown" when nothing is available.
    """
    forwarded_for = request.headers.get("x-forwarded-for", "")
    for hop in forwarded_for.split(","):
        hop = hop.strip()
        if hop:
            return hop

    real_ip = request.headers.get("x-real-ip", "").strip()
    if real_ip:
        return real_ip

    return request.client.host if request.client else "unknown"
|
||||
|
||||
|
||||
def is_article_deleted(html: str) -> bool:
    """Report whether the page text indicates the article was deleted."""
    if '已删除' in html:
        return True
    return 'deleted' in html.lower()
|
||||
|
||||
def is_need_verification(html: str) -> bool:
    """Report whether the page text indicates a verification challenge."""
    if 'verify' in html.lower():
        return True
    return any(hint in html for hint in ('验证', '环境异常'))
|
||||
|
||||
def is_login_required(html: str) -> bool:
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ import httpx
|
|||
|
||||
from utils.auth_manager import auth_manager
|
||||
from utils import rss_store
|
||||
from utils.helpers import extract_article_info, parse_article_url
|
||||
from utils.helpers import extract_article_info, parse_article_url, is_image_text_message, has_article_content
|
||||
from utils.http_client import fetch_page
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -225,8 +225,8 @@ class RSSPoller:
|
|||
continue
|
||||
|
||||
html = results.get(link)
|
||||
if not html or "js_content" not in html:
|
||||
logger.warning("❌ No content in HTML: %s", link[:80])
|
||||
if not html or not has_article_content(html):
|
||||
logger.warning("No content in HTML: %s", link[:80])
|
||||
enriched.append(article)
|
||||
continue
|
||||
|
||||
|
|
@ -246,7 +246,7 @@ class RSSPoller:
|
|||
article_info = extract_article_info(html, parse_article_url(link))
|
||||
article["author"] = article_info.get("author", "")
|
||||
|
||||
logger.info("✅ Content fetched: %s... (%d chars, %d images)",
|
||||
logger.info("Content fetched: %s... (%d chars, %d images)",
|
||||
link[:50],
|
||||
len(article["content"]),
|
||||
len(result.get("images", [])))
|
||||
|
|
|
|||
|
|
@ -133,17 +133,29 @@ def update_last_poll(fakeid: str):
|
|||
# ── 文章缓存 ─────────────────────────────────────────────
|
||||
|
||||
def save_articles(fakeid: str, articles: List[Dict]) -> int:
|
||||
"""批量保存文章,返回新增数量"""
|
||||
"""
|
||||
批量保存文章,返回新增数量。
|
||||
If an article already exists but has empty content, update it with new content.
|
||||
"""
|
||||
conn = _get_conn()
|
||||
inserted = 0
|
||||
try:
|
||||
for a in articles:
|
||||
content = a.get("content", "")
|
||||
plain_content = a.get("plain_content", "")
|
||||
try:
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO articles "
|
||||
cursor = conn.execute(
|
||||
"INSERT INTO articles "
|
||||
"(fakeid, aid, title, link, digest, cover, author, "
|
||||
"content, plain_content, publish_time, fetched_at) "
|
||||
"VALUES (?,?,?,?,?,?,?,?,?,?,?)",
|
||||
"VALUES (?,?,?,?,?,?,?,?,?,?,?) "
|
||||
"ON CONFLICT(fakeid, link) DO UPDATE SET "
|
||||
"content = CASE WHEN excluded.content != '' AND articles.content = '' "
|
||||
" THEN excluded.content ELSE articles.content END, "
|
||||
"plain_content = CASE WHEN excluded.plain_content != '' AND articles.plain_content = '' "
|
||||
" THEN excluded.plain_content ELSE articles.plain_content END, "
|
||||
"author = CASE WHEN excluded.author != '' AND articles.author = '' "
|
||||
" THEN excluded.author ELSE articles.author END",
|
||||
(
|
||||
fakeid,
|
||||
a.get("aid", ""),
|
||||
|
|
@ -152,13 +164,13 @@ def save_articles(fakeid: str, articles: List[Dict]) -> int:
|
|||
a.get("digest", ""),
|
||||
a.get("cover", ""),
|
||||
a.get("author", ""),
|
||||
a.get("content", ""),
|
||||
a.get("plain_content", ""),
|
||||
content,
|
||||
plain_content,
|
||||
a.get("publish_time", 0),
|
||||
int(time.time()),
|
||||
),
|
||||
)
|
||||
if conn.total_changes:
|
||||
if cursor.rowcount > 0:
|
||||
inserted += 1
|
||||
except sqlite3.IntegrityError:
|
||||
pass
|
||||
|
|
|
|||
Loading…
Reference in New Issue