phase1-ingest-resolve
This commit is contained in:
@@ -1,16 +1,62 @@
|
||||
"""JobSource interface: every ingestion provider must implement fetch_recent_jobs().
|
||||
"""JobSource interface and shared LinkedIn URL helpers.
|
||||
|
||||
Scaffold stub -- not implemented yet.
|
||||
Every ingestion provider implements JobSource. The helpers here are
|
||||
shared by all providers and have no external dependencies.
|
||||
"""
|
||||
# TODO (Stage 1): define the JobSource ABC per CLAUDE.md "Stage 1 — Ingest (deterministic)".
|
||||
# Interface:
|
||||
# class JobSource(ABC):
|
||||
# @abstractmethod
|
||||
# def fetch_recent_jobs(
|
||||
# self,
|
||||
# search_terms: list[str],
|
||||
# location: str,
|
||||
# hours_old: int,
|
||||
# results_wanted: int,
|
||||
# ) -> list[RawJob]: ...
|
||||
# Implementations: jobspy_source.JobSpySource, apify_source.ApifySource.
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from ..models import RawJob
|
||||
|
||||
# Matches the "/jobs/view/<digits>" portion of a LinkedIn job URL;
# capture group 1 is the run of digits that follows "/jobs/view/".
_LINKEDIN_JOB_URL_RE = re.compile(r"/jobs/view/(\d+)")
|
||||
|
||||
|
||||
class JobSource(ABC):
    """Common contract that every job-ingestion provider subclasses.

    Concrete providers must override ``fetch_recent_jobs``; the base class
    cannot be instantiated on its own.
    """

    @abstractmethod
    def fetch_recent_jobs(
        self,
        search_terms: list[str],
        location: str,
        hours_old: int,
        results_wanted: int,
    ) -> list[RawJob]:
        """Fetch recent postings for ``search_terms`` within ``location``.

        Contract for implementers: do not let errors propagate. Callers
        rely on always getting a list back, which may be empty when
        something went wrong.
        """
|
||||
|
||||
|
||||
def parse_linkedin_job_id(url: str | None) -> str | None:
    """Pull the numeric posting id out of a LinkedIn job-view URL.

    The first ``/jobs/view/<digits>`` occurrence in *url* is used; anything
    after the digits (such as tracking query parameters) is not part of the
    result. Returns None when *url* is None or empty, or when no such
    segment is present.
    """
    if url:
        found = _LINKEDIN_JOB_URL_RE.search(url)
        if found is not None:
            return found.group(1)
    return None
|
||||
|
||||
|
||||
def canonical_linkedin_url(job_id: str) -> str:
    """Build the bare LinkedIn job-view URL for ``job_id``.

    The id is interpolated directly after ``/jobs/view/``; no query string
    or tracking parameters are appended.
    """
    clean_url = f"https://www.linkedin.com/jobs/view/{job_id}"
    return clean_url
|
||||
|
||||
|
||||
def clean_value(value: object) -> str | None:
|
||||
"""Normalize a source cell to str | None.
|
||||
|
||||
Treats None, empty/whitespace strings, and float NaN (pandas sentinel)
|
||||
as None — without importing pandas.
|
||||
"""
|
||||
if value is None:
|
||||
return None
|
||||
# float NaN check without pandas: NaN is the only float where v != v
|
||||
if isinstance(value, float) and value != value:
|
||||
return None
|
||||
s = str(value).strip()
|
||||
return s if s else None
|
||||
|
||||
Reference in New Issue
Block a user