Files
usa/crawler/news_storage.py
2026-03-02 17:20:31 +08:00

142 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
资讯内容独立存储,支持历史去重
爬虫拉回数据 → 计算 content_hash → 若已存在则跳过(去重)→ 新数据落库 news_content
"""
import hashlib
import os
import re
import sqlite3
from datetime import datetime, timezone
from typing import List, Optional, Tuple
from config import DB_PATH
def _to_utc_iso(dt: datetime) -> str:
if dt.tzinfo:
dt = dt.astimezone(timezone.utc)
return dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
def _normalize_for_hash(text: str) -> str:
"""归一化文本用于生成去重 hash"""
if not text:
return ""
t = re.sub(r"\s+", " ", str(text).strip().lower())[:600]
return re.sub(r"[\x00-\x1f]", "", t)
def content_hash(title: str, summary: str, url: str) -> str:
    """Build a 32-char dedup hash from title, summary and URL.

    Near-identical items (same normalized title/summary and same trimmed
    URL) collapse to the same hash and are treated as duplicates.
    """
    parts = (
        _normalize_for_hash(title),
        _normalize_for_hash(summary),
        (url or "").strip(),
    )
    digest = hashlib.sha256("|".join(parts).encode("utf-8"))
    return digest.hexdigest()[:32]
def _ensure_table(conn: sqlite3.Connection) -> None:
conn.execute("""
CREATE TABLE IF NOT EXISTS news_content (
id TEXT PRIMARY KEY,
content_hash TEXT NOT NULL UNIQUE,
title TEXT NOT NULL,
summary TEXT NOT NULL,
url TEXT NOT NULL DEFAULT '',
source TEXT NOT NULL DEFAULT '',
published_at TEXT NOT NULL,
category TEXT NOT NULL DEFAULT 'other',
severity TEXT NOT NULL DEFAULT 'medium',
created_at TEXT NOT NULL DEFAULT (datetime('now'))
)
""")
try:
conn.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_news_content_hash ON news_content(content_hash)")
except sqlite3.OperationalError:
pass
try:
conn.execute("CREATE INDEX IF NOT EXISTS idx_news_content_pub ON news_content(published_at DESC)")
except sqlite3.OperationalError:
pass
conn.commit()
def exists_by_hash(conn: sqlite3.Connection, h: str) -> bool:
    """Return True when a news_content row with content_hash *h* exists."""
    cur = conn.execute(
        "SELECT 1 FROM news_content WHERE content_hash = ? LIMIT 1", (h,)
    )
    return cur.fetchone() is not None
def insert_news(
    conn: sqlite3.Connection,
    *,
    title: str,
    summary: str,
    url: str = "",
    source: str = "",
    published: datetime,
    category: str = "other",
    severity: str = "medium",
) -> Optional[str]:
    """
    Insert one news item, skipping it when its content_hash already exists.

    Returns:
        The new row id on success, or None when the item is a duplicate.
    """
    _ensure_table(conn)
    h = content_hash(title, summary, url)
    if exists_by_hash(conn, h):
        return None
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # datetime; use an aware UTC timestamp. It only serves as salt so that
    # ids stay unique across calls.
    salt = datetime.now(timezone.utc).isoformat()
    uid = "nc_" + hashlib.sha256(f"{h}{salt}".encode()).hexdigest()[:14]
    ts = _to_utc_iso(published)
    try:
        conn.execute(
            """INSERT INTO news_content (id, content_hash, title, summary, url, source, published_at, category, severity)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (uid, h, (title or "")[:500], (summary or "")[:2000], (url or "")[:500], (source or "")[:100], ts, category, severity),
        )
    except sqlite3.IntegrityError:
        # Race: another writer inserted the same content_hash between the
        # exists_by_hash() check and this INSERT (the column is UNIQUE).
        # Treat it exactly like the dedup-skip path instead of crashing.
        return None
    conn.commit()
    return uid
def save_and_dedup(items: List[dict], db_path: Optional[str] = None) -> Tuple[List[dict], int]:
    """
    Deduplicate *items* and persist the new ones into news_content.

    items: [{"title","summary","url","published","category","severity","source"?}, ...]
    Returns: (items that survived dedup, each with a "news_id" key added,
              number of rows actually inserted)
    """
    path = db_path or DB_PATH
    if not os.path.exists(path):
        # Best effort: with no existing database there is nothing to store.
        return [], 0
    conn = sqlite3.connect(path, timeout=10)
    try:
        _ensure_table(conn)
        inserted: List[dict] = []
        for item in items:
            # "published" may arrive as an ISO string, a datetime, or be
            # missing entirely; fall back to "now" (UTC) when unusable.
            raw_pub = item.get("published")
            if isinstance(raw_pub, str):
                try:
                    pub = datetime.fromisoformat(raw_pub.replace("Z", "+00:00"))
                except ValueError:
                    pub = datetime.now(timezone.utc)
            elif raw_pub is None:
                pub = datetime.now(timezone.utc)
            else:
                pub = raw_pub
            news_id = insert_news(
                conn,
                title=(item.get("title") or "")[:500],
                summary=(item.get("summary") or item.get("title") or "")[:2000],
                url=(item.get("url") or "")[:500],
                source=(item.get("source") or "")[:100],
                published=pub,
                category=item.get("category", "other"),
                severity=item.get("severity", "medium"),
            )
            if news_id:
                inserted.append({**item, "news_id": news_id})
        # Every successful insert appended exactly one item, so the count
        # equals the list length.
        return inserted, len(inserted)
    finally:
        conn.close()