feat: add new folder
This commit is contained in:
209
gig-poc/apps/api/app/services/extraction_service.py
Normal file
209
gig-poc/apps/api/app/services/extraction_service.py
Normal file
@@ -0,0 +1,209 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from dateutil import parser as date_parser
|
||||
from pydantic import ValidationError
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging import logger
|
||||
from app.domain.schemas import ExtractResponse, JobCard, Salary, SkillScore, WorkerCard
|
||||
from app.services.llm_client import LLMClient
|
||||
from app.utils.ids import generate_id
|
||||
from app.utils.prompts import load_prompt
|
||||
|
||||
|
||||
class ExtractionService:
    """Turn free-form (Chinese) gig text into structured job / worker cards.

    Each public ``extract_*`` method first asks the LLM client for a JSON
    payload and validates it against the card schema. When the LLM call
    raises or yields a falsy result, keyword / regex heuristics driven by
    the vocabularies loaded in ``__init__`` are used instead.
    """

    # Location used when the text names no known city or region.
    _DEFAULT_CITY = "深圳"
    _DEFAULT_REGION = "南山"

    # "5个人" / "3名" / "2位" / "4人".  The look-ahead keeps "个" from
    # counting when it is a measure word for a time unit ("4个小时", "2个月").
    _HEADCOUNT_PATTERNS = [r"(\d+)\s*(?:个(?!\s*(?:半?小时|钟|月|星期|礼拜))|[名人位])"]
    # "4小时", "2.5 小时" and the colloquial "4个小时".
    _DURATION_PATTERNS = [r"(\d+(?:\.\d+)?)\s*个?\s*小时"]
    _SALARY_PATTERNS = [r"(\d+(?:\.\d+)?)\s*(?:元|块)"]
    # Tried in order: an explicit "我叫…" wins over "我是…".
    _NAME_PATTERNS = [r"我叫([\u4e00-\u9fa5]{2,4})", r"我是([\u4e00-\u9fa5]{2,4})"]
    # Clock time such as "14:30", "14：30", "3点", "8时", optionally preceded
    # by an afternoon / evening marker that applies to that very number.
    _CLOCK_RE = re.compile(r"(下午|晚上|傍晚)?\s*(\d{1,2})[:：点时]")

    def __init__(self) -> None:
        """Load settings, the sample vocabularies and the LLM client."""
        self.settings = get_settings()
        self.skills = self._load_sample("skills.json")
        self.categories = self._load_sample("categories.json")
        self.regions = self._load_sample("regions.json")
        self.llm_client = LLMClient(self.settings)

    def _load_sample(self, filename: str):
        """Parse one UTF-8 JSON file from ``settings.sample_data_dir``."""
        path = self.settings.sample_data_dir / filename
        return json.loads(path.read_text(encoding="utf-8"))

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------

    def extract_job(self, text: str) -> ExtractResponse:
        """Extract a ``JobCard`` from *text* (LLM first, rules as fallback)."""
        logger.info("extract_job request text=%s", text)
        return self._extract_card(text, "job_extract.md", JobCard, self._extract_job_rule, "job")

    def extract_worker(self, text: str) -> ExtractResponse:
        """Extract a ``WorkerCard`` from *text* (LLM first, rules as fallback)."""
        logger.info("extract_worker request text=%s", text)
        return self._extract_card(text, "worker_extract.md", WorkerCard, self._extract_worker_rule, "worker")

    def _extract_card(self, text: str, prompt_name: str, card_cls, rule_extractor, kind: str) -> ExtractResponse:
        """Shared LLM-then-rules pipeline behind ``extract_job`` / ``extract_worker``.

        A truthy LLM result is validated against *card_cls*; if validation
        fails the failure is returned as-is (no rule fallback), so the caller
        can see which fields were missing.  Only when the LLM yields nothing
        is *rule_extractor* used.
        """
        llm_result = self._llm_extract(text, self.settings.prompt_dir / prompt_name)
        if llm_result:
            try:
                return ExtractResponse(success=True, data=card_cls(**llm_result.content))
            except ValidationError as exc:
                logger.exception("LLM %s extraction validation failed", kind)
                return self._validation_failure(exc)

        try:
            return ExtractResponse(success=True, data=rule_extractor(text))
        except ValidationError as exc:
            logger.exception("Rule %s extraction validation failed", kind)
            return self._validation_failure(exc)

    def _validation_failure(self, exc: ValidationError) -> ExtractResponse:
        """Build the unsuccessful response for a schema validation error."""
        return ExtractResponse(success=False, errors=[str(exc)], missing_fields=self._missing_fields(exc))

    def _llm_extract(self, text: str, prompt_path: Path):
        """Call the LLM client with the prompt at *prompt_path*.

        Returns whatever ``llm_client.extract_json`` returns, or ``None`` when
        loading the prompt or calling the client raises.  The broad ``except``
        is deliberate: any LLM-side failure must degrade to the rule-based
        path instead of failing the request.
        """
        try:
            return self.llm_client.extract_json(load_prompt(prompt_path), text)
        except Exception:
            logger.exception("LLM extraction failed, fallback to rule-based extraction")
            return None

    # ------------------------------------------------------------------
    # Rule-based extraction
    # ------------------------------------------------------------------

    def _extract_job_rule(self, text: str) -> JobCard:
        """Build a ``JobCard`` from *text* using keyword and regex heuristics.

        Raises:
            ValidationError: if the assembled fields violate the card schema.
        """
        skill_hits = [item for item in self.skills if item in text]
        category = next((item for item in self.categories if item in text), "活动执行")
        region = self._extract_region(text)
        salary = self._extract_salary(text)
        headcount = self._extract_number(text, self._HEADCOUNT_PATTERNS, default=1)
        duration = self._extract_number(text, self._DURATION_PATTERNS, default=4.0, cast=float)
        tags = [tag for tag in ["女生优先", "男生优先", "有经验优先", "沟通好", "可连做优先"] if tag in text]
        title = f"{category}{skill_hits[0]}兼职" if skill_hits else f"{category}兼职"

        return JobCard(
            job_id=generate_id("job"),
            title=title,
            category=category,
            description=text,
            skills=skill_hits[:5] or self._guess_category_skills(category),
            city=region["city"],
            region=region["region"],
            location_detail=self._extract_location(text, region),
            start_time=self._extract_job_time(text),
            duration_hours=duration,
            headcount=int(headcount),
            salary=salary,
            work_mode="排班制" if "排班" in text else "兼职",
            tags=tags or ["有经验优先"],
            # NOTE(review): region and salary fall back to defaults, so these
            # two signals are effectively always "present" — confirm intended.
            confidence=self._compute_confidence(skill_hits, region, salary.amount > 0),
        )

    def _extract_worker_rule(self, text: str) -> WorkerCard:
        """Build a ``WorkerCard`` from *text* using keyword and regex heuristics.

        Raises:
            ValidationError: if the assembled fields violate the card schema.
        """
        skill_hits = [item for item in self.skills if item in text][:6]
        region_hits = [item for item in self.regions if item["region"] in text or item["city"] in text]
        # dict.fromkeys de-duplicates while keeping first-seen order.
        city_names = list(dict.fromkeys(item["city"] for item in region_hits)) or [self._DEFAULT_CITY]
        region_names = list(dict.fromkeys(item["region"] for item in region_hits)) or [self._DEFAULT_REGION]
        experience = [
            item
            for item in ["商场", "会展", "活动执行", "物流", "零售", "客服中心", "快消", "校园推广"]
            if item in text
        ]
        # Placeholder scoring: later hits score slightly higher (0.72, 0.76, ...).
        skills = [
            SkillScore(name=item, score=round(0.72 + index * 0.04, 2))
            for index, item in enumerate(skill_hits or ["活动执行", "引导", "登记"])
        ]

        return WorkerCard(
            worker_id=generate_id("worker"),
            name=self._extract_name(text),
            description=text,
            skills=skills,
            cities=city_names,
            regions=region_names,
            availability=self._extract_availability(text),
            experience_tags=experience or ["活动执行"],
            reliability_score=0.76,
            profile_completion=0.68,
            confidence=self._compute_confidence(
                skill_hits, {"city": city_names[0], "region": region_names[0]}, True
            ),
        )

    def _extract_region(self, text: str) -> dict:
        """Pick the best matching entry of ``self.regions`` for *text*.

        Preference order: city and region both named, then region only, then
        city only (the first listed region of that city is returned so that
        an explicitly named city is never replaced by the default), and
        finally the default city / region.
        """
        for item in self.regions:
            if item["city"] in text and item["region"] in text:
                return item
        for item in self.regions:
            if item["region"] in text:
                return item
        for item in self.regions:
            if item["city"] in text:
                return item
        return {"city": self._DEFAULT_CITY, "region": self._DEFAULT_REGION}

    def _extract_location(self, text: str, region: dict) -> str:
        """Return ``city + region + venue`` for the first venue keyword in *text*.

        Falls back to a "待定点位" (to-be-decided) placeholder venue.
        """
        prefix = f"{region['city']}{region['region']}"
        for marker in ["会展中心", "商场", "地铁站", "园区", "写字楼", "仓库", "门店"]:
            if marker in text:
                return f"{prefix}{marker}"
        return f"{prefix}待定点位"

    def _extract_salary(self, text: str) -> Salary:
        """Parse the first "<number>元/块" amount; default is 150 CNY per day.

        The pay is treated as hourly only when the text contains "/小时".
        """
        amount = self._extract_number(text, self._SALARY_PATTERNS, default=150.0, cast=float)
        salary_type = "hourly" if "/小时" in text else "daily"
        return Salary(type=salary_type, amount=amount, currency="CNY")

    def _extract_number(self, text: str, patterns: list[str], default, cast=int):
        """Return ``cast(group 1)`` of the first pattern that matches *text*.

        Patterns are tried in order; *default* is returned (un-cast) when
        none of them matches.
        """
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return cast(match.group(1))
        return default

    def _extract_job_time(self, text: str) -> datetime:
        """Guess the job start time from *text* as an aware UTC+8 datetime.

        Date: "明天" (tomorrow), "后天" (the day after), or an explicit
        "M月D日" in the current year; anything else — including an impossible
        date such as "2月30日" — means tomorrow.
        Hour: 09:00 by default, 13:00 if "下午" appears, 19:00 if "晚上"
        appears; an explicit clock time overrides these.  An afternoon /
        evening marker directly in front of the number shifts hours below 12
        into the 24-hour range ("下午3点" -> 15:00).  Out-of-range hours are
        ignored.  Minutes are always zeroed.
        """
        shanghai_tz = timezone(timedelta(hours=8))
        now = datetime.now(shanghai_tz)
        tomorrow = now + timedelta(days=1)

        if "明天" in text:
            base = tomorrow
        elif "后天" in text:
            base = now + timedelta(days=2)
        else:
            base = tomorrow
            month_day = re.search(r"(\d{1,2})月(\d{1,2})日", text)
            if month_day:
                try:
                    base = now.replace(month=int(month_day.group(1)), day=int(month_day.group(2)))
                except ValueError:
                    # Not a real calendar date; keep the "tomorrow" default
                    # instead of letting the request crash.
                    base = tomorrow

        hour = 9
        if "下午" in text:
            hour = 13
        elif "晚上" in text:
            hour = 19

        clock = self._CLOCK_RE.search(text)
        if clock:
            explicit = int(clock.group(2))
            # Only shift when the marker is attached to this number, so that
            # "上午9点到下午5点" still starts at 09:00.
            if clock.group(1) and explicit < 12:
                explicit += 12
            if 0 <= explicit <= 23:
                hour = explicit

        return base.replace(hour=hour, minute=0, second=0, microsecond=0)

    def _extract_availability(self, text: str) -> list[str]:
        """Map availability keywords in *text* to tags; default ``["anytime"]``."""
        tags = []
        if "周末" in text:
            tags.append("weekend")
        if "上午" in text:
            tags.append("weekday_am")
        if "下午" in text:
            tags.append("weekday_pm")
        if any(word in text for word in ("随时", "都能", "全天")):
            tags.append("anytime")
        return tags or ["anytime"]

    def _extract_name(self, text: str) -> str:
        """Return the 2-4 CJK characters after "我叫" / "我是", else a placeholder.

        NOTE(review): the match is greedy, so characters following a short
        name may be captured as part of it.
        """
        for pattern in self._NAME_PATTERNS:
            if match := re.search(pattern, text):
                return match.group(1)
        return "匿名候选人"

    def _guess_category_skills(self, category: str) -> list[str]:
        """Return stock skills for *category*, used when no skill keyword matched."""
        mapping = {
            "活动执行": ["签到", "引导", "登记"],
            "促销": ["促销", "导购", "陈列"],
            "配送": ["配送", "装卸", "司机协助"],
            "客服": ["客服", "电话邀约", "线上客服"],
        }
        return mapping.get(category, ["活动执行", "沟通"])

    def _compute_confidence(self, skill_hits: list[str], region: dict, has_salary: bool) -> float:
        """Score extraction quality: 0.55 base plus bonuses, capped at 0.95.

        Bonuses: +0.15 for any skill hit, +0.15 for a non-empty ``city`` in
        *region*, +0.10 when *has_salary* is true.
        """
        score = 0.55
        if skill_hits:
            score += 0.15
        if region.get("city"):
            score += 0.15
        if has_salary:
            score += 0.1
        return min(round(score, 2), 0.95)

    def _missing_fields(self, exc: ValidationError) -> list[str]:
        """Return the dotted location path of every error in *exc*."""
        return [".".join(str(part) for part in item["loc"]) for item in exc.errors()]
|
||||
Reference in New Issue
Block a user