# Source: JobSourceAgent/jobsource/resolve.py
# Snapshot: 2026-06-17 13:59:00 -04:00 (123 lines, 4.1 KiB, Python)

"""Resolve company name → company website URL (Stage 1b, deterministic).
Three-tier cascade — returns on first hit:
Tier 1: provider-supplied website (trusted, no network call).
Tier 2: verified {slug}.com guess (HTTP HEAD/GET probe).
Tier 3: search API (gated by SEARCH_API_ENABLED; ships as a stub — wire
a real provider in _search_api_lookup() when ready).
"""
from __future__ import annotations

import logging
import re
import unicodedata

import httpx

from .config import get_settings
from .http import build_client, request_with_retries
logger = logging.getLogger(__name__)
# Corporate-form tokens (Inc, LLC, GmbH, ...) removed from a company name
# before it is turned into a domain slug. Matched case-insensitively and
# only as whole words.
_LEGAL_SUFFIXES = re.compile(
    r"\b(inc|llc|ltd|corp|co|gmbh|plc|sa|ag|pbc|lp|llp)\b",
    flags=re.IGNORECASE,
)

# Any run of characters other than lowercase ASCII letters and digits.
_NON_ALNUM = re.compile(r"[^a-z0-9]+")

# Status codes on a HEAD probe that are treated as "HEAD not supported";
# the probe is then repeated with GET.
_HEAD_NOT_SUPPORTED = frozenset((405, 501))
def resolve_website(
    company_name: str,
    website: str | None = None,
    *,
    client: httpx.Client | None = None,
) -> str | None:
    """Return the company's own website URL, or None if unresolvable.

    Tiers are tried in order and the first hit wins: the provider-supplied
    ``website``, then a verified ``https://{slug}.com`` guess, then the
    optional search API.

    Args:
        company_name: Company name; used for the slug guess, the search
            lookup, and log messages.
        website: Provider-supplied URL, if any. Blank values and values
            starting with "PLACEHOLDER" are ignored.
        client: Existing httpx.Client to reuse across many calls. When
            omitted, a short-lived client is created only if a network tier
            is reached, and it is closed before returning.

    Returns:
        The resolved URL, or None when every tier misses.
    """
    settings = get_settings()

    # Tier 1 — provider-supplied website (trusted, no network needed).
    # Checked before any client is built so this path does no network setup.
    # Surrounding whitespace is dropped so a padded or blank value cannot
    # turn into a malformed URL such as "https:// ".
    provided = website.strip() if website else ""
    if provided and not provided.startswith("PLACEHOLDER"):
        resolved = _normalize_scheme(provided)
        logger.info("resolve(%s): tier=provider url=%s", company_name, resolved)
        return resolved

    # Only close the client if it was created here; a caller-supplied client
    # stays open for reuse.
    owns_client = client is None
    if owns_client:
        client = build_client()
    try:
        # Tier 2 — {slug}.com guess with HTTP verification.
        slug = _slug(company_name)
        if slug:
            verified = _verify(client, f"https://{slug}.com")
            if verified:
                logger.info("resolve(%s): tier=slug_guess url=%s", company_name, verified)
                return verified

        # Tier 3 — optional search API (gated; stub by default).
        # A missing or empty key counts as "not configured", same as a
        # PLACEHOLDER value.
        api_key = settings.search_api_key or ""
        key_is_real = bool(api_key) and not api_key.startswith("PLACEHOLDER")
        if settings.search_api_enabled and key_is_real:
            result = _search_api_lookup(company_name, client)
            if result:
                logger.info("resolve(%s): tier=search_api url=%s", company_name, result)
                return result

        logger.info("resolve(%s): unresolvable (all tiers missed)", company_name)
        return None
    finally:
        if owns_client:
            client.close()
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _normalize_scheme(url: str) -> str:
    """Return *url* with an explicit scheme, defaulting to https://.

    Surrounding whitespace is removed. A URL that already starts with
    http:// or https:// (in any letter case) is returned otherwise unchanged.
    A protocol-relative URL ("//example.com") has its leading slashes
    replaced by "https://". Anything else is prefixed with "https://".
    """
    cleaned = url.strip()
    # URL schemes are case-insensitive, so compare on a lowercased copy but
    # return the caller's original spelling.
    if cleaned.lower().startswith(("http://", "https://")):
        return cleaned
    # Avoid producing "https:////example.com" for protocol-relative input.
    if cleaned.startswith("//"):
        cleaned = cleaned[2:]
    return f"https://{cleaned}"
def _slug(name: str) -> str | None:
    """Normalize a company name to a domain slug.

    The name is lowercased, accented letters are folded to their base
    letters, legal suffix words are removed, and every remaining character
    that is not a lowercase ASCII letter or digit is dropped.

    Returns:
        The slug, or None when nothing is left after normalization.
    """
    lowered = name.lower()
    # Decompose accented characters (e.g. "é" -> "e" + combining accent) and
    # drop the combining marks, so "Nestlé" yields "nestle" rather than
    # losing the letter entirely to the alnum filter below.
    decomposed = unicodedata.normalize("NFKD", lowered)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    without_suffixes = _LEGAL_SUFFIXES.sub("", folded)
    slug = _NON_ALNUM.sub("", without_suffixes)
    return slug or None
def _verify(client: httpx.Client, url: str) -> str | None:
    """Probe *url* and return the final URL if the site responds.

    A HEAD request is sent first; if the status is one of
    _HEAD_NOT_SUPPORTED the probe is repeated with GET. Each request goes
    through request_with_retries with max_retries=1.

    Returns:
        ``str(resp.url)`` when the final status code is below 400, otherwise
        None. Any exception raised while probing also yields None — the
        probe is deliberately best-effort — but the cause is logged at debug
        level instead of being discarded.
    """
    try:
        resp = request_with_retries(client, "HEAD", url, max_retries=1)
        if resp.status_code in _HEAD_NOT_SUPPORTED:
            resp = request_with_retries(client, "GET", url, max_retries=1)
        if resp.status_code < 400:
            return str(resp.url)
        logger.debug("verify(%s): rejected with status %s", url, resp.status_code)
        return None
    except Exception as exc:
        # A failed guess is an expected outcome, not an error for the caller;
        # keep returning None but leave a trace for troubleshooting.
        logger.debug("verify(%s): probe failed: %r", url, exc)
        return None
def _search_api_lookup(company_name: str, client: httpx.Client) -> str | None:
    """Tier 3 hook: look up the company's website through a search API.

    Not implemented yet — this placeholder ignores its arguments and always
    returns None.

    Intended implementation when a provider is wired in:
      1. Query the search API for f'"{company_name}" official website',
         authenticating with settings.search_api_key.
      2. Take the top organic result's URL and extract its registrable domain.
      3. Confirm the candidate with _verify() and return the URL, or None.

    The caller reaches this function only when SEARCH_API_ENABLED=true and a
    real SEARCH_API_KEY is set.
    """
    return None