# JobSourceAgent/jobsource/models.py

"""Pydantic data models shared across the pipeline.

RawJob is the normalized output of any job source (Stage 1). JobResult is the
per-job record that flows through the cascade and becomes one CSV row. The CSV
contract is exactly three columns: company_name, career_page_url,
open_position_url.
"""
from __future__ import annotations

from datetime import datetime
from enum import Enum

from pydantic import BaseModel, Field


class JobStatus(str, Enum):
    """Lifecycle of a single job record. Complete == position_found.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value (e.g. ``JobStatus.new == "new"``).
    """

    # Initial state; JobResult.from_raw seeds records with this value.
    new = "new"
    # NOTE(review): presumably set once the company's website is known — confirm
    # against the cascade code that assigns it.
    website_resolved = "website_resolved"
    # NOTE(review): presumably set once a career page URL is found — confirm.
    careers_found = "careers_found"
    # The "complete" state: JobResult.is_complete is True only for this value.
    position_found = "position_found"
    # NOTE(review): the exact conditions for these two states are decided
    # outside this file.
    failed = "failed"
    needs_review = "needs_review"


class RawJob(BaseModel):
    """A job posting in the normalized shape every source provider emits (Stage 1).

    ``job_id``, ``company`` and ``linkedin_url`` are required. The remaining
    fields are optional and default to ``None`` when the source omits them.
    """

    # Required fields.
    job_id: str = Field(
        ...,
        description="LinkedIn numeric jobPostingId, parsed from the job URL.",
    )
    company: str = Field(
        ...,
        description="Company name as reported by the source.",
    )
    linkedin_url: str = Field(
        ...,
        description="Canonical LinkedIn job-view URL.",
    )

    # Optional fields (None when not provided).
    website: str | None = Field(
        None,
        description="Company's own site, if provided.",
    )
    listed_at: datetime | None = Field(
        None,
        description="When the job was posted, if known.",
    )
    title: str | None = Field(
        None,
        description="Job title, if provided.",
    )
    location: str | None = Field(
        None,
        description="Job location, if provided.",
    )


class JobResult(BaseModel):
    """One job's record as it is carried through the cascade.

    Only ``job_id`` and ``company_name`` are required; ``status`` defaults to
    ``JobStatus.new`` and every other field to ``None``. ``to_csv_row``
    exports three of the fields as a single CSV row.
    """

    # Identity.
    job_id: str
    company_name: str
    company_key: str | None = Field(
        None,
        description="Normalized domain, else lowercased name.",
    )

    # URLs; each stays None until a value is assigned.
    website: str | None = None
    career_page_url: str | None = None
    open_position_url: str | None = None

    # Lifecycle position of this record.
    status: JobStatus = JobStatus.new

    # Metadata copied over from the source posting by ``from_raw``.
    linkedin_url: str | None = None
    listed_at: datetime | None = None
    title: str | None = None
    location: str | None = None

    # Observability: the cascade tier/method that resolved each stage.
    careers_method: str | None = None
    position_method: str | None = None

    @property
    def is_complete(self) -> bool:
        """Whether ``status`` equals ``JobStatus.position_found``."""
        finished = JobStatus.position_found
        return self.status == finished

    @classmethod
    def from_raw(cls, raw: RawJob) -> "JobResult":
        """Build a fresh result from ``raw`` with ``status`` set to ``new``.

        Copies the id, company name, website, LinkedIn URL, listing time,
        title and location; all remaining fields keep their defaults.
        """
        seed = {
            "job_id": raw.job_id,
            "company_name": raw.company,
            "website": raw.website,
            "linkedin_url": raw.linkedin_url,
            "listed_at": raw.listed_at,
            "title": raw.title,
            "location": raw.location,
            "status": JobStatus.new,
        }
        return cls(**seed)

    def to_csv_row(self) -> dict[str, str]:
        """Return the three contract columns as a dict.

        Any falsy value (``None`` or an empty string) is emitted as ``""``.
        """
        columns = ("company_name", "career_page_url", "open_position_url")
        return {column: getattr(self, column) or "" for column in columns}


# Header of the CSV output contract: exactly these column names, in this order.
CSV_COLUMNS: tuple[str, str, str] = (
    "company_name",
    "career_page_url",
    "open_position_url",
)