phase1-ingest-resolve

This commit is contained in:
ldy
2026-06-17 13:59:00 -04:00
parent f13b8fc1ca
commit cd9ab9b95e
8 changed files with 958 additions and 35 deletions

View File

@@ -1,16 +1,62 @@
"""JobSource interface: every ingestion provider must implement fetch_recent_jobs().
"""JobSource interface and shared LinkedIn URL helpers.
Scaffold stub -- not implemented yet.
Every ingestion provider implements JobSource. The helpers here are
shared by all providers and have no external dependencies.
"""
# Stage 1: the JobSource ABC defined in this module follows CLAUDE.md "Stage 1 — Ingest (deterministic)".
# Interface:
# class JobSource(ABC):
# @abstractmethod
# def fetch_recent_jobs(
# self,
# search_terms: list[str],
# location: str,
# hours_old: int,
# results_wanted: int,
# ) -> list[RawJob]: ...
# Implementations: jobspy_source.JobSpySource, apify_source.ApifySource.
from __future__ import annotations
import re
from abc import ABC, abstractmethod
from ..models import RawJob
# Matches the "/jobs/view/<digits>" portion of a LinkedIn job URL.
# Group 1 captures the run of digits that follows "/jobs/view/".
_LINKEDIN_JOB_URL_RE = re.compile(r"/jobs/view/(\d+)")
class JobSource(ABC):
    """Contract that every job-ingestion provider must satisfy.

    Subclasses supply fetch_recent_jobs(); the class cannot be instantiated
    until that method is overridden.
    """

    @abstractmethod
    def fetch_recent_jobs(
        self,
        search_terms: list[str],
        location: str,
        hours_old: int,
        results_wanted: int,
    ) -> list[RawJob]:
        """Fetch recently posted jobs for the given search terms and location.

        Implementations must not raise: callers expect a list back on any
        error, and that list may be empty.
        """
def parse_linkedin_job_id(url: str | None) -> str | None:
    """Pull the numeric job id out of a LinkedIn job-view URL.

    The id is the run of digits that follows ``/jobs/view/``. The pattern is
    searched anywhere in the string, so text around the match (for example
    trailing query parameters) does not affect the result.

    Returns None when *url* is None or empty, or when no
    ``/jobs/view/{digits}`` segment is present.
    """
    # A falsy url (None or "") is never searched.
    found = _LINKEDIN_JOB_URL_RE.search(url) if url else None
    if found is None:
        return None
    return found.group(1)
def canonical_linkedin_url(job_id: str) -> str:
    """Build the bare LinkedIn job-view URL for *job_id*.

    The result carries no query string, so no tracking parameters.
    """
    prefix = "https://www.linkedin.com/jobs/view/"
    return f"{prefix}{job_id}"
def clean_value(value: object) -> str | None:
    """Coerce a raw source cell into a stripped string, or None if it is blank.

    None, float NaN, and values whose string form is empty or only
    whitespace all map to None. Anything else is returned as
    ``str(value)`` with surrounding whitespace removed.
    """
    # NaN is the only float that compares unequal to itself, so this
    # detects it without needing pandas or math.
    is_nan = isinstance(value, float) and value != value
    if value is None or is_nan:
        return None

    text = str(value).strip()
    return text or None