"""Pydantic data models shared across the pipeline. RawJob is the normalized output of any job source (Stage 1). JobResult is the per-job record that flows through the cascade and becomes one CSV row. The CSV contract is exactly three columns: company_name, career_page_url, open_position_url. """ from __future__ import annotations from datetime import datetime from enum import Enum from pydantic import BaseModel, Field class JobStatus(str, Enum): """Lifecycle of a single job record. Complete == position_found.""" new = "new" website_resolved = "website_resolved" careers_found = "careers_found" position_found = "position_found" failed = "failed" needs_review = "needs_review" class RawJob(BaseModel): """Normalized job posting from a source provider (Stage 1 output).""" job_id: str = Field(..., description="LinkedIn numeric jobPostingId, parsed from the job URL.") company: str = Field(..., description="Company name as reported by the source.") linkedin_url: str = Field(..., description="Canonical LinkedIn job-view URL.") website: str | None = Field(default=None, description="Company's own site, if provided.") listed_at: datetime | None = Field(default=None, description="When the job was posted, if known.") title: str | None = Field(default=None, description="Job title, if provided.") location: str | None = Field(default=None, description="Job location, if provided.") class JobResult(BaseModel): """Per-job record carried through the cascade; serializes to one CSV row.""" job_id: str company_name: str company_key: str | None = Field( default=None, description="Normalized domain, else lowercased name." ) website: str | None = None career_page_url: str | None = None open_position_url: str | None = None status: JobStatus = JobStatus.new linkedin_url: str | None = None listed_at: datetime | None = None title: str | None = None location: str | None = None # Observability: which cascade tier/method resolved each stage. careers_method: str | None = None position_method: str | None = None @property def is_complete(self) -> bool: """A record is complete once an open position has been found.""" return self.status == JobStatus.position_found @classmethod def from_raw(cls, raw: RawJob) -> "JobResult": """Seed a result from a raw job (status starts at `new`).""" return cls( job_id=raw.job_id, company_name=raw.company, website=raw.website, linkedin_url=raw.linkedin_url, listed_at=raw.listed_at, title=raw.title, location=raw.location, status=JobStatus.new, ) def to_csv_row(self) -> dict[str, str]: """Return exactly the three contract columns (empty string for None).""" return { "company_name": self.company_name or "", "career_page_url": self.career_page_url or "", "open_position_url": self.open_position_url or "", } # The CSV output contract — exactly these columns, in this order. CSV_COLUMNS: tuple[str, str, str] = ("company_name", "career_page_url", "open_position_url")