Files
JobSourceAgent/jobsource/sources/base.py
2026-06-17 13:59:00 -04:00

63 lines
1.8 KiB
Python

"""JobSource interface and shared LinkedIn URL helpers.
Every ingestion provider implements JobSource. The helpers here are
shared by all providers and have no external dependencies.
"""
from __future__ import annotations
import re
from abc import ABC, abstractmethod
from ..models import RawJob
# Captures (group 1) the run of digits that immediately follows "/jobs/view/".
# Compiled once at import time so the helpers below can reuse it.
_LINKEDIN_JOB_URL_RE = re.compile(r"/jobs/view/(\d+)")
class JobSource(ABC):
    """Interface that every ingestion provider must implement.

    The class carries no state or behavior of its own; it only declares
    the single method a concrete provider has to supply.
    """

    @abstractmethod
    def fetch_recent_jobs(
        self,
        search_terms: list[str],
        location: str,
        hours_old: int,
        results_wanted: int,
    ) -> list[RawJob]:
        """Fetch recent jobs that match ``search_terms`` in ``location``.

        Contract for implementers: this method must not raise. Callers
        expect a list back in every case, so any failure should surface
        as an empty (or partial) list rather than an exception.

        Args:
            search_terms: Terms the returned jobs should match.
            location: Location the search is restricted to.
            hours_old: Presumably the maximum posting age in hours
                (inferred from the name; confirm against providers).
            results_wanted: Presumably the maximum number of jobs to
                return (inferred from the name; confirm against providers).

        Returns:
            The matching jobs, possibly an empty list.
        """
def parse_linkedin_job_id(url: str | None) -> str | None:
    """Pull the numeric job posting id out of a LinkedIn job-view URL.

    The module-level pattern is searched for anywhere in ``url``, so
    whatever trails the digits (tracking query params, fragments) has no
    effect on the result.

    NOTE(review): because the whole string is searched, a
    ``/jobs/view/{digits}`` sequence that appears inside the query string
    is matched as well, not only one in the path.

    Returns:
        The digits following ``/jobs/view/`` as a string, or None when
        ``url`` is None, empty, or contains no ``/jobs/view/{digits}``.
    """
    if url:
        match = _LINKEDIN_JOB_URL_RE.search(url)
        if match is not None:
            return match.group(1)
    return None
def canonical_linkedin_url(job_id: str) -> str:
    """Build the bare LinkedIn job-view URL for ``job_id``.

    The result is just the fixed ``/jobs/view/`` prefix followed by the
    id, so it never carries query or tracking parameters. ``job_id`` is
    interpolated as given; it is not validated here.
    """
    clean_url = f"https://www.linkedin.com/jobs/view/{job_id}"
    return clean_url
def clean_value(value: object) -> str | None:
    """Turn a raw source cell into a stripped string, or None if it is empty.

    None is returned for:
      * ``None`` itself,
      * a float NaN (the missing-value sentinel pandas uses), and
      * anything whose ``str()`` form is empty or whitespace only.

    Every other value is converted with ``str()`` and stripped of
    surrounding whitespace. pandas is deliberately not imported.
    """
    # NaN is the only float that compares unequal to itself, which lets us
    # detect it without pandas (or math) being involved.
    is_nan = isinstance(value, float) and value != value
    if value is None or is_nan:
        return None

    text = str(value).strip()
    # An empty string is falsy, so this collapses "" to None.
    return text or None