63 lines
1.8 KiB
Python
63 lines
1.8 KiB
Python
"""JobSource interface and shared LinkedIn URL helpers.
|
|
|
|
Every ingestion provider implements JobSource. The helpers here are
|
|
shared by all providers and have no external dependencies.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from abc import ABC, abstractmethod
|
|
|
|
from ..models import RawJob
|
|
|
|
# Matches "/jobs/view/" followed by one or more digits; group 1 captures the digits.
_LINKEDIN_JOB_URL_RE = re.compile(r"/jobs/view/(\d+)")
|
|
|
|
|
|
class JobSource(ABC):
    """Common interface that every ingestion provider must implement."""

    @abstractmethod
    def fetch_recent_jobs(
        self,
        search_terms: list[str],
        location: str,
        hours_old: int,
        results_wanted: int,
    ) -> list[RawJob]:
        """Fetch recent jobs for the given search terms and location.

        Contract for implementers: this method must not raise. Callers
        expect to get a list back (it may be empty) even when an error
        occurs.
        """
|
|
|
|
|
|
def parse_linkedin_job_id(url: str | None) -> str | None:
    """Pull the numeric job id out of a LinkedIn /jobs/view/{id} URL.

    Returns the run of digits that follows "/jobs/view/" as a string.
    Returns None when url is None or empty, or when that pattern does not
    occur in it. Whatever comes after the digits (for example tracking
    query params) is not part of the result.
    """
    if url:
        match = _LINKEDIN_JOB_URL_RE.search(url)
        if match is not None:
            return match.group(1)
    # Falsy input and non-matching URLs both end up here.
    return None
|
|
|
|
|
|
def canonical_linkedin_url(job_id: str) -> str:
    """Build the bare LinkedIn job-view URL for job_id.

    The result is just the fixed https://www.linkedin.com/jobs/view/ prefix
    followed by job_id, so it carries no query string or tracking params.
    """
    clean_url = f"https://www.linkedin.com/jobs/view/{job_id}"
    return clean_url
|
|
|
|
|
|
def clean_value(value: object) -> str | None:
    """Coerce a raw source cell into a stripped string, or None if blank.

    None, float NaN (the pandas missing-value sentinel), and values whose
    string form is empty or whitespace-only all map to None. Anything else
    is returned as str(value) with surrounding whitespace removed. pandas
    itself is never imported.
    """
    # NaN is the only float that compares unequal to itself, so this
    # detects it without needing pandas.
    is_nan = isinstance(value, float) and value != value
    if value is None or is_nan:
        return None

    text = str(value).strip()
    # An empty string is falsy, so blank cells collapse to None here.
    return text or None
|