fix: 优化核心包

This commit is contained in:
Daniel
2026-04-07 20:15:19 +08:00
parent 84f8be7c0e
commit 6220c5d6c5
4 changed files with 344 additions and 44 deletions

View File

@@ -1,6 +1,6 @@
from __future__ import annotations from __future__ import annotations
from datetime import datetime from datetime import datetime, timedelta, timezone
from enum import Enum from enum import Enum
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
@@ -46,6 +46,16 @@ class JobCard(BaseModel):
tags: list[str] = Field(default_factory=list, description="业务标签列表") tags: list[str] = Field(default_factory=list, description="业务标签列表")
confidence: float = Field(ge=0, le=1, description="数据置信度,范围 0~1") confidence: float = Field(ge=0, le=1, description="数据置信度,范围 0~1")
@field_validator("start_time", mode="after")
@classmethod
def normalize_start_time(cls, value: datetime) -> datetime:
    """Pin ``start_time`` to UTC+8 and drop sub-minute precision.

    A naive value is labelled as UTC+8 without shifting its wall-clock
    reading; an aware value is converted to UTC+8. Seconds and
    microseconds are zeroed in both cases.
    """
    utc_plus_8 = timezone(timedelta(hours=8))
    if value.tzinfo is not None:
        localized = value.astimezone(utc_plus_8)
    else:
        localized = value.replace(tzinfo=utc_plus_8)
    return localized.replace(second=0, microsecond=0)
class WorkerCard(BaseModel): class WorkerCard(BaseModel):
worker_id: str = Field(description="工人唯一 ID") worker_id: str = Field(description="工人唯一 ID")

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import json import json
import re import re
from collections import Counter
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from pathlib import Path from pathlib import Path
@@ -22,7 +23,19 @@ class ExtractionService:
self.skills = json.loads((self.settings.sample_data_dir / "skills.json").read_text(encoding="utf-8")) self.skills = json.loads((self.settings.sample_data_dir / "skills.json").read_text(encoding="utf-8"))
self.categories = json.loads((self.settings.sample_data_dir / "categories.json").read_text(encoding="utf-8")) self.categories = json.loads((self.settings.sample_data_dir / "categories.json").read_text(encoding="utf-8"))
self.regions = json.loads((self.settings.sample_data_dir / "regions.json").read_text(encoding="utf-8")) self.regions = json.loads((self.settings.sample_data_dir / "regions.json").read_text(encoding="utf-8"))
self.sample_jobs = json.loads((self.settings.sample_data_dir / "jobs.json").read_text(encoding="utf-8"))
self.sample_workers = json.loads((self.settings.sample_data_dir / "workers.json").read_text(encoding="utf-8"))
self.default_region = self._build_default_region()
self.default_category = self._build_default_category()
self.default_salary_amount = self._build_default_salary_amount()
self.default_job_tags = self._build_default_job_tags()
self.default_worker_skills = self._build_default_worker_skills()
self.default_experience_tags = self._build_default_experience_tags()
self.category_skill_defaults = self._build_category_skill_defaults()
self.city_region_defaults = self._build_city_region_defaults()
self.tag_candidates = self._build_tag_candidates()
self.llm_client = LLMClient(self.settings) self.llm_client = LLMClient(self.settings)
self.shanghai_tz = timezone(timedelta(hours=8))
def extract_job(self, text: str) -> ExtractResponse: def extract_job(self, text: str) -> ExtractResponse:
logger.info("extract_job request text=%s", text) logger.info("extract_job request text=%s", text)
@@ -110,12 +123,12 @@ class ExtractionService:
def _extract_job_rule(self, text: str) -> JobCard: def _extract_job_rule(self, text: str) -> JobCard:
skill_hits = [item for item in self.skills if item in text] skill_hits = [item for item in self.skills if item in text]
category = next((item for item in self.categories if item in text), "活动执行") category = next((item for item in self.categories if item in text), self.default_category)
region = self._extract_region(text) region = self._extract_region(text)
salary = self._extract_salary(text) salary = self._extract_salary(text)
headcount = self._extract_number(text, [r"(\d+)\s*[个名人位]"], default=1) headcount = self._extract_number(text, [r"(\d+)\s*[个名人位]"], default=1)
duration = self._extract_number(text, [r"(\d+(?:\.\d+)?)\s*小时"], default=4.0, cast=float) duration = self._extract_number(text, [r"(\d+(?:\.\d+)?)\s*小时"], default=4.0, cast=float)
tags = [tag for tag in ["女生优先", "男生优先", "有经验优先", "沟通好", "可连做优先"] if tag in text] tags = [tag for tag in self.tag_candidates if tag in text][:3]
title = next((f"{category}{skill_hits[0]}兼职" for _ in [0] if skill_hits), f"{category}兼职") title = next((f"{category}{skill_hits[0]}兼职" for _ in [0] if skill_hits), f"{category}兼职")
card = JobCard( card = JobCard(
job_id=generate_id("job"), job_id=generate_id("job"),
@@ -131,7 +144,7 @@ class ExtractionService:
headcount=int(headcount), headcount=int(headcount),
salary=salary, salary=salary,
work_mode="排班制" if "排班" in text else "兼职", work_mode="排班制" if "排班" in text else "兼职",
tags=tags or ["有经验优先"], tags=tags or self.default_job_tags,
confidence=self._compute_confidence(skill_hits, region, salary.amount > 0), confidence=self._compute_confidence(skill_hits, region, salary.amount > 0),
) )
return card return card
@@ -139,19 +152,29 @@ class ExtractionService:
def _extract_worker_rule(self, text: str) -> WorkerCard: def _extract_worker_rule(self, text: str) -> WorkerCard:
skill_hits = [item for item in self.skills if item in text][:6] skill_hits = [item for item in self.skills if item in text][:6]
region_hits = [item for item in self.regions if item["region"] in text or item["city"] in text] region_hits = [item for item in self.regions if item["region"] in text or item["city"] in text]
city_names = list(dict.fromkeys([item["city"] for item in region_hits])) or ["深圳"] if not region_hits:
region_names = list(dict.fromkeys([item["region"] for item in region_hits])) or ["南山"] city_hits = [item["city"] for item in self.regions if item["city"] in text]
unique_city_hits = list(dict.fromkeys(city_hits))
region_hits = [
{"city": city, "region": self.city_region_defaults.get(city, self.default_region["region"])}
for city in unique_city_hits
]
city_names = list(dict.fromkeys([item["city"] for item in region_hits])) or [self.default_region["city"]]
region_names = list(dict.fromkeys([item["region"] for item in region_hits])) or [self.default_region["region"]]
availability = self._extract_availability(text) availability = self._extract_availability(text)
experience = [item for item in ["商场", "会展", "活动执行", "物流", "零售", "客服中心", "快消", "校园推广"] if item in text] experience = [item for item in self.default_experience_tags if item in text]
card = WorkerCard( card = WorkerCard(
worker_id=generate_id("worker"), worker_id=generate_id("worker"),
name=self._extract_name(text), name=self._extract_name(text),
description=text, description=text,
skills=[SkillScore(name=item, score=round(0.72 + index * 0.04, 2)) for index, item in enumerate(skill_hits or ["活动执行", "引导", "登记"])], skills=[
SkillScore(name=item, score=round(0.72 + index * 0.04, 2))
for index, item in enumerate(skill_hits or self.default_worker_skills)
],
cities=city_names, cities=city_names,
regions=region_names, regions=region_names,
availability=availability, availability=availability,
experience_tags=experience or ["活动执行"], experience_tags=experience or self.default_experience_tags[:2],
reliability_score=0.76, reliability_score=0.76,
profile_completion=0.68, profile_completion=0.68,
confidence=self._compute_confidence(skill_hits, {"city": city_names[0], "region": region_names[0]}, True), confidence=self._compute_confidence(skill_hits, {"city": city_names[0], "region": region_names[0]}, True),
@@ -165,7 +188,10 @@ class ExtractionService:
for item in self.regions: for item in self.regions:
if item["region"] in text: if item["region"] in text:
return item return item
return {"city": "深圳", "region": "南山"} city_match = next((item["city"] for item in self.regions if item["city"] in text), "")
if city_match:
return {"city": city_match, "region": self.city_region_defaults.get(city_match, self.default_region["region"])}
return self.default_region
def _extract_location(self, text: str, region: dict) -> str: def _extract_location(self, text: str, region: dict) -> str:
markers = ["会展中心", "商场", "地铁站", "园区", "写字楼", "仓库", "门店"] markers = ["会展中心", "商场", "地铁站", "园区", "写字楼", "仓库", "门店"]
@@ -175,7 +201,7 @@ class ExtractionService:
return f"{region['city']}{region['region']}待定点位" return f"{region['city']}{region['region']}待定点位"
def _extract_salary(self, text: str) -> Salary:
    """Build the ``Salary`` for a free-text job description.

    The amount comes from ``self._extract_number`` using a pattern for a
    number followed by 元 or 块, with ``self.default_salary_amount`` as the
    default. The type is ``"hourly"`` only when the text contains ``/小时``;
    otherwise it is ``"daily"``. The currency is always ``"CNY"``.
    """
    amount_patterns = [r"(\d+(?:\.\d+)?)\s*(?:元|块)"]
    amount = self._extract_number(
        text,
        amount_patterns,
        default=self.default_salary_amount,
        cast=float,
    )
    # "/小时" can only occur when "小时" does, so one membership test is enough.
    if "/小时" in text:
        salary_type = "hourly"
    else:
        salary_type = "daily"
    return Salary(type=salary_type, amount=amount, currency="CNY")
@@ -187,28 +213,72 @@ class ExtractionService:
return default return default
def _extract_job_time(self, text: str) -> datetime: def _extract_job_time(self, text: str) -> datetime:
shanghai_tz = timezone(timedelta(hours=8)) now = datetime.now(self.shanghai_tz)
now = datetime.now(shanghai_tz) for candidate in self._time_candidates(text, now):
parsed = self._parse_datetime(candidate, now)
if parsed:
return parsed
return self._normalize_datetime(now + timedelta(days=1))
def _time_candidates(self, text: str, now: datetime) -> list[str]:
candidates = [text]
if any(token in text for token in ("今天", "今日")):
candidates.append(text.replace("今日", now.strftime("%Y-%m-%d")).replace("今天", now.strftime("%Y-%m-%d")))
if "明天" in text: if "明天" in text:
base = now + timedelta(days=1) tomorrow = now + timedelta(days=1)
elif "" in text: candidates.append(text.replace("", tomorrow.strftime("%Y-%m-%d")))
base = now + timedelta(days=2) if "后天" in text:
day_after = now + timedelta(days=2)
candidates.append(text.replace("后天", day_after.strftime("%Y-%m-%d")))
weekday_map = {"": 0, "": 1, "": 2, "": 3, "": 4, "": 5, "": 6, "": 6}
week_match = re.search(r"(下周|本周|这周|周)([一二三四五六日天])", text)
if week_match:
week_token, weekday_token = week_match.groups()
target_weekday = weekday_map[weekday_token]
days_ahead = (target_weekday - now.weekday()) % 7
if week_token == "下周":
days_ahead = days_ahead + 7
elif week_token == "" and days_ahead == 0:
days_ahead = 7
target_day = now + timedelta(days=days_ahead)
candidates.append(text.replace(week_match.group(0), target_day.strftime("%Y-%m-%d")))
return candidates
def _parse_datetime(self, text: str, now: datetime) -> datetime | None:
normalized = self._replace_time_words(text)
cleaned = re.sub(r"[,、。;,;]", " ", normalized)
cleaned = cleaned.replace("", "")
cleaned = re.sub(r"(\d{1,2})月(\d{1,2})日", rf"{now.year}-\1-\2", cleaned)
cleaned = re.sub(r"(\d{1,2})点半", r"\1:30", cleaned)
cleaned = re.sub(r"(\d{1,2})点", r"\1:00", cleaned)
cleaned = re.sub(r"(\d{1,2})时", r"\1:00", cleaned)
has_date = bool(re.search(r"\d{4}-\d{1,2}-\d{1,2}", cleaned))
if not has_date:
return None
try:
parsed = date_parser.parse(cleaned, fuzzy=True)
except Exception:
return None
return self._normalize_datetime(parsed)
def _replace_time_words(self, text: str) -> str:
replaced = text
replaced = re.sub(r"(今晚|晚上)", " 19:00 ", replaced)
replaced = re.sub(r"(下午)", " 14:00 ", replaced)
replaced = re.sub(r"(中午)", " 12:00 ", replaced)
replaced = re.sub(r"(早上|上午)", " 09:00 ", replaced)
replaced = re.sub(r"(凌晨)", " 01:00 ", replaced)
return replaced
def _normalize_datetime(self, value: datetime) -> datetime:
if value.tzinfo is None:
value = value.replace(tzinfo=self.shanghai_tz)
else: else:
month_day = re.search(r"(\d{1,2})月(\d{1,2})日", text) value = value.astimezone(self.shanghai_tz)
if month_day: return value.replace(second=0, microsecond=0)
month, day = int(month_day.group(1)), int(month_day.group(2))
base = now.replace(month=month, day=day)
else:
base = now + timedelta(days=1)
hour = 9
if "下午" in text:
hour = 13
elif "晚上" in text:
hour = 19
explicit_hour = re.search(r"(\d{1,2})[:点时](\d{0,2})?", text)
if explicit_hour:
hour = int(explicit_hour.group(1))
return base.replace(hour=hour, minute=0, second=0, microsecond=0)
def _extract_availability(self, text: str) -> list[str]: def _extract_availability(self, text: str) -> list[str]:
tags = [] tags = []
@@ -230,13 +300,10 @@ class ExtractionService:
return "匿名候选人" return "匿名候选人"
def _guess_category_skills(self, category: str) -> list[str]: def _guess_category_skills(self, category: str) -> list[str]:
mapping = { skills = self.category_skill_defaults.get(category)
"活动执行": ["签到", "引导", "登记"], if skills:
"促销": ["促销", "导购", "陈列"], return skills
"配送": ["配送", "装卸", "司机协助"], return self.default_worker_skills[:3]
"客服": ["客服", "电话邀约", "线上客服"],
}
return mapping.get(category, ["活动执行", "沟通"])
def _compute_confidence(self, skill_hits: list[str], region: dict, has_salary: bool) -> float: def _compute_confidence(self, skill_hits: list[str], region: dict, has_salary: bool) -> float:
score = 0.55 score = 0.55
@@ -250,3 +317,109 @@ class ExtractionService:
def _missing_fields(self, exc: ValidationError) -> list[str]: def _missing_fields(self, exc: ValidationError) -> list[str]:
return [".".join(str(part) for part in item["loc"]) for item in exc.errors()] return [".".join(str(part) for part in item["loc"]) for item in exc.errors()]
def _build_default_region(self) -> dict:
if self.sample_jobs:
pair_counter = Counter(
(item.get("city"), item.get("region"))
for item in self.sample_jobs
if item.get("city") and item.get("region")
)
if pair_counter:
city, region = pair_counter.most_common(1)[0][0]
return {"city": city, "region": region}
if self.regions:
return {"city": self.regions[0]["city"], "region": self.regions[0]["region"]}
return {"city": "深圳", "region": "南山"}
def _build_default_category(self) -> str:
counter = Counter(item.get("category") for item in self.sample_jobs if item.get("category"))
if counter:
return counter.most_common(1)[0][0]
return self.categories[0] if self.categories else "活动执行"
def _build_default_salary_amount(self) -> float:
amounts = sorted(
float(item["salary"]["amount"])
for item in self.sample_jobs
if isinstance(item.get("salary"), dict) and isinstance(item["salary"].get("amount"), (int, float))
)
if not amounts:
return 150.0
mid = len(amounts) // 2
if len(amounts) % 2 == 1:
return amounts[mid]
return round((amounts[mid - 1] + amounts[mid]) / 2, 2)
def _build_default_job_tags(self) -> list[str]:
counter = Counter(
tag
for item in self.sample_jobs
for tag in item.get("tags", [])
if isinstance(tag, str) and tag.strip()
)
top_tags = [tag for tag, _ in counter.most_common(3)]
return top_tags or ["有经验优先"]
def _build_default_worker_skills(self) -> list[str]:
counter = Counter(
skill.get("name")
for item in self.sample_workers
for skill in item.get("skills", [])
if isinstance(skill, dict) and isinstance(skill.get("name"), str) and skill.get("name")
)
top_skills = [name for name, _ in counter.most_common(4)]
return top_skills or ["活动执行", "引导", "登记"]
def _build_default_experience_tags(self) -> list[str]:
counter = Counter(
tag
for item in self.sample_workers
for tag in item.get("experience_tags", [])
if isinstance(tag, str) and tag.strip()
)
top_tags = [tag for tag, _ in counter.most_common(5)]
return top_tags or ["活动执行"]
def _build_category_skill_defaults(self) -> dict[str, list[str]]:
category_skills: dict[str, Counter] = {}
for item in self.sample_jobs:
category = item.get("category")
if not isinstance(category, str) or not category:
continue
counter = category_skills.setdefault(category, Counter())
for skill in item.get("skills", []):
if isinstance(skill, str) and skill:
counter[skill] += 1
return {category: [name for name, _ in counter.most_common(4)] for category, counter in category_skills.items()}
def _build_city_region_defaults(self) -> dict[str, str]:
counter: dict[str, Counter] = {}
for item in self.regions:
city = item.get("city")
region = item.get("region")
if not city or not region:
continue
counter.setdefault(city, Counter())[region] += 1
for item in self.sample_jobs:
city = item.get("city")
region = item.get("region")
if city and region:
counter.setdefault(city, Counter())[region] += 3
defaults: dict[str, str] = {}
for city, regions in counter.items():
defaults[city] = regions.most_common(1)[0][0]
return defaults
def _build_tag_candidates(self) -> list[str]:
sample_tags = list(
dict.fromkeys(
tag
for item in self.sample_jobs
for tag in item.get("tags", [])
if isinstance(tag, str) and tag.strip()
)
)
baseline_tags = ["女生优先", "男生优先", "有经验优先", "沟通好", "可连做优先"]
merged = list(dict.fromkeys([*sample_tags, *baseline_tags]))
return merged[:30]

View File

@@ -1,16 +1,76 @@
import { useState } from "react"; import { useEffect, useState } from "react";
import { api } from "../api/client"; import { api } from "../api/client";
import { JsonPanel } from "../components/JsonPanel"; import { JsonPanel } from "../components/JsonPanel";
import { MatchList } from "../components/MatchList"; import { MatchList } from "../components/MatchList";
const DEFAULT_TEXT = "明天下午南山会展中心需要2个签到协助5小时150/人,女生优先,需要会签到、引导和登记。"; const FALLBACK_TEXT = "明天下午南山会展中心需要2个签到协助5小时150/人,女生优先,需要会签到、引导和登记。";
/**
 * Return one element of `items`, chosen with `Math.random()`.
 * For an empty array the computed index is 0, so the runtime result is
 * `undefined`; callers are expected to check for emptiness first.
 */
function pickRandom<T>(items: T[]): T {
  const index = Math.floor(Math.random() * items.length);
  return items[index];
}
/** Return `value` trimmed when it is a string, otherwise the empty string. */
function asString(value: unknown): string {
  if (typeof value !== "string") {
    return "";
  }
  return value.trim();
}
/** Return `value` when it is a finite number, otherwise `null`. */
function asNumber(value: unknown): number | null {
  if (typeof value === "number" && Number.isFinite(value)) {
    return value;
  }
  return null;
}
/**
 * Return the string members of `value` that are not blank, in order.
 * Kept strings are returned untrimmed. A non-array input yields `[]`.
 */
function asStringArray(value: unknown): string[] {
  const entries: unknown[] = Array.isArray(value) ? value : [];
  const kept: string[] = [];
  for (const entry of entries) {
    if (typeof entry === "string" && entry.trim().length > 0) {
      kept.push(entry);
    }
  }
  return kept;
}
/**
 * Compose a sample job description from one randomly picked record.
 * Returns `FALLBACK_TEXT` when `items` is empty. Missing or mistyped fields
 * fall back to fixed defaults (深圳 / 南山, 2 people, 4 hours, 150).
 */
function buildAdaptiveJobText(items: Record<string, unknown>[]): string {
  if (items.length === 0) {
    return FALLBACK_TEXT;
  }
  const job = pickRandom(items);

  const city = asString(job.city) || "深圳";
  const region = asString(job.region) || "南山";
  const place = asString(job.location_detail) || `${city}${region}待定点位`;
  const role = asString(job.title) || asString(job.category) || "活动兼职";
  const people = asNumber(job.headcount) ?? 2;
  const hours = asNumber(job.duration_hours) ?? 4;

  const pay = (job.salary as Record<string, unknown> | undefined) ?? {};
  const amount = asNumber(pay.amount) ?? 150;

  const skillNames = asStringArray(job.skills).slice(0, 3);
  const tagNames = asStringArray(job.tags).slice(0, 2);
  const skillText = skillNames.length > 0 ? `需要会${skillNames.join("、")}` : "有相关经验优先";
  // Joining an empty list already yields "", so no emptiness check is needed.
  const tagText = tagNames.join("");

  // NOTE(review): tags, skills and the head count are concatenated without
  // separators; confirm this is the intended sentence.
  return `明天下午${place}需要${people}${role}${hours}小时,${amount}/人${tagText}${skillText}`;
}
export function JobPage() { export function JobPage() {
const [text, setText] = useState(DEFAULT_TEXT); const [text, setText] = useState("");
const [jobCard, setJobCard] = useState<unknown>(null); const [jobCard, setJobCard] = useState<unknown>(null);
const [matches, setMatches] = useState<any[]>([]); const [matches, setMatches] = useState<any[]>([]);
const [loading, setLoading] = useState(false); const [loading, setLoading] = useState(false);
useEffect(() => {
let active = true;
void (async () => {
try {
const result = await api.jobs();
if (!active) {
return;
}
setText((current) => current || buildAdaptiveJobText(result.items));
} catch {
if (!active) {
return;
}
setText((current) => current || FALLBACK_TEXT);
}
})();
return () => {
active = false;
};
}, []);
const handleExtract = async () => { const handleExtract = async () => {
setLoading(true); setLoading(true);
try { try {

View File

@@ -1,16 +1,73 @@
import { useState } from "react"; import { useEffect, useState } from "react";
import { api } from "../api/client"; import { api } from "../api/client";
import { JsonPanel } from "../components/JsonPanel"; import { JsonPanel } from "../components/JsonPanel";
import { MatchList } from "../components/MatchList"; import { MatchList } from "../components/MatchList";
const DEFAULT_TEXT = "我做过商场促销和活动签到,也能做登记和引导,周末都能接,福田南山都方便。"; const FALLBACK_TEXT = "我做过商场促销和活动签到,也能做登记和引导,周末都能接,福田南山都方便。";
/**
 * Pick a random element of `items` using `Math.random()`.
 * An empty array gives index 0 and therefore `undefined` at runtime, so
 * callers should guard against empty input.
 */
function pickRandom<T>(items: T[]): T {
  const position = Math.floor(Math.random() * items.length);
  return items[position];
}
/** Trimmed string for string input; the empty string for anything else. */
function asString(value: unknown): string {
  if (typeof value === "string") {
    return value.trim();
  }
  return "";
}
/**
 * Collect the non-blank string members of `value`, preserving order and
 * leaving each kept string untrimmed. Anything that is not an array gives `[]`.
 */
function asStringArray(value: unknown): string[] {
  const result: string[] = [];
  if (Array.isArray(value)) {
    for (const member of value) {
      if (typeof member === "string" && member.trim().length > 0) {
        result.push(member);
      }
    }
  }
  return result;
}
/**
 * Compose a sample worker self-description from one randomly picked record.
 * Returns `FALLBACK_TEXT` when `items` is empty. Missing fields fall back to
 * fixed phrases; availability counts as weekend-ready when any entry
 * contains "weekend".
 */
function buildAdaptiveWorkerText(items: Record<string, unknown>[]): string {
  if (items.length === 0) {
    return FALLBACK_TEXT;
  }
  const worker = pickRandom(items);
  const speaker = asString(worker.name) || "我";

  // Skills arrive as objects; keep the non-empty names of the well-formed ones.
  const rawSkills: unknown[] = Array.isArray(worker.skills) ? worker.skills : [];
  const skillNames: string[] = [];
  for (const entry of rawSkills) {
    const label = entry && typeof entry === "object" ? asString((entry as Record<string, unknown>).name) : "";
    if (label) {
      skillNames.push(label);
    }
  }
  const topSkills = skillNames.slice(0, 3);
  const topRegions = asStringArray(worker.regions).slice(0, 2);
  const topExperiences = asStringArray(worker.experience_tags).slice(0, 2);
  const weekendReady = asStringArray(worker.availability).some((slot) => slot.includes("weekend"));

  const expText = topExperiences.length > 0 ? topExperiences.join("和") : "活动执行";
  const skillText = topSkills.length > 0 ? topSkills.join("、") : "沟通和执行";
  const regionText = topRegions.length > 0 ? `${topRegions.join("、")}都方便` : "同城都方便";
  const timeText = weekendReady ? "周末都能接" : "时间比较灵活";

  // NOTE(review): the skill, time and region phrases are concatenated
  // without separators; confirm this is the intended sentence.
  return `${speaker}做过${expText},也能做${skillText}${timeText}${regionText}`;
}
export function WorkerPage() { export function WorkerPage() {
const [text, setText] = useState(DEFAULT_TEXT); const [text, setText] = useState("");
const [workerCard, setWorkerCard] = useState<unknown>(null); const [workerCard, setWorkerCard] = useState<unknown>(null);
const [matches, setMatches] = useState<any[]>([]); const [matches, setMatches] = useState<any[]>([]);
const [loading, setLoading] = useState(false); const [loading, setLoading] = useState(false);
useEffect(() => {
let active = true;
void (async () => {
try {
const result = await api.workers();
if (!active) {
return;
}
setText((current) => current || buildAdaptiveWorkerText(result.items));
} catch {
if (!active) {
return;
}
setText((current) => current || FALLBACK_TEXT);
}
})();
return () => {
active = false;
};
}, []);
const handleExtract = async () => { const handleExtract = async () => {
setLoading(true); setLoading(true);
try { try {