123 lines
4.1 KiB
Python
123 lines
4.1 KiB
Python
"""Resolve company name → company website URL (Stage 1b, deterministic).
|
|
|
|
Three-tier cascade — returns on first hit:
|
|
Tier 1: provider-supplied website (trusted, no network call).
|
|
Tier 2: verified {slug}.com guess (HTTP HEAD/GET probe).
|
|
Tier 3: search API (gated by SEARCH_API_ENABLED; ships as a stub — wire
|
|
a real provider in _search_api_lookup() when ready).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
|
|
import httpx
|
|
|
|
from .config import get_settings
|
|
from .http import build_client, request_with_retries
|
|
|
|
logger = logging.getLogger(__name__)

# Company-form designators (Inc, LLC, GmbH, ...) removed from a name before
# it is turned into a domain slug. Matched as whole words, any letter case.
_LEGAL_SUFFIXES = re.compile(r"\b(inc|llc|ltd|corp|co|gmbh|plc|sa|ag|pbc|lp|llp)\b", flags=re.I)

# Runs of anything other than lowercase ASCII letters and digits.
_NON_ALNUM = re.compile(r"[^a-z0-9]+")

# Status codes treated as "HEAD is not supported here"; the probe is then
# repeated with GET.
_HEAD_NOT_SUPPORTED = frozenset((405, 501))
|
|
|
|
|
|
def resolve_website(
    company_name: str,
    website: str | None = None,
    *,
    client: httpx.Client | None = None,
) -> str | None:
    """Return the company's own website URL, or None if unresolvable.

    Tiers are tried in order and the first hit wins:

    1. ``website`` as supplied by the provider, unless it is blank or starts
       with ``PLACEHOLDER``. It is only scheme-normalized; no HTTP client is
       built and no settings are read on this path.
    2. ``https://{slug}.com``, where the slug comes from ``_slug()``,
       accepted only when ``_verify()`` returns a URL for it.
    3. ``_search_api_lookup()``, consulted only when
       ``settings.search_api_enabled`` is truthy and
       ``settings.search_api_key`` is non-empty and does not start with
       ``PLACEHOLDER``.

    Args:
        company_name: Name used for the slug guess, the search lookup and
            the log lines.
        website: Optional provider-supplied URL or bare host.
        client: Existing httpx.Client to reuse across many calls. When
            omitted, a client is created with ``build_client()`` and closed
            before this function returns.

    Returns:
        The resolved URL, or None when every tier misses.
    """
    # Tier 1 — provider-supplied website (trusted, no network needed).
    # Handled before any client is created so this path allocates nothing.
    candidate = website.strip() if website else ""
    if candidate and not candidate.startswith("PLACEHOLDER"):
        resolved = _normalize_scheme(candidate)
        logger.info("resolve(%s): tier=provider url=%s", company_name, resolved)
        return resolved

    owns_client = client is None
    if owns_client:
        client = build_client()

    try:
        # Tier 2 — {slug}.com guess with HTTP verification.
        slug = _slug(company_name)
        if slug:
            verified = _verify(client, f"https://{slug}.com")
            if verified:
                logger.info("resolve(%s): tier=slug_guess url=%s", company_name, verified)
                return verified

        # Tier 3 — optional search API (gated; stub by default).
        settings = get_settings()
        if settings.search_api_enabled:
            # A missing or empty key is treated like a placeholder key.
            api_key = settings.search_api_key or ""
            if api_key and not api_key.startswith("PLACEHOLDER"):
                result = _search_api_lookup(company_name, client)
                if result:
                    logger.info("resolve(%s): tier=search_api url=%s", company_name, result)
                    return result

        logger.info("resolve(%s): unresolvable (all tiers missed)", company_name)
        return None
    finally:
        # Only close a client this call created; a caller's client stays open.
        if owns_client:
            client.close()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Internal helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _normalize_scheme(url: str) -> str:
    """Return *url* with an explicit scheme, defaulting to ``https://``.

    Surrounding whitespace is removed first. A URL that already starts with
    ``http://`` or ``https://`` (in any letter case) is returned as-is apart
    from that trimming. Anything else, including a protocol-relative
    ``//host`` form, gets ``https://`` prepended.
    """
    cleaned = url.strip()
    # Schemes are case-insensitive, so compare against a lowercased copy.
    if cleaned.lower().startswith(("http://", "https://")):
        return cleaned
    # Drop leading slashes so "//example.com" does not become "https:////...".
    return f"https://{cleaned.lstrip('/')}"
|
|
|
|
|
|
def _slug(name: str) -> str | None:
    """Normalize a company name to a domain slug.

    Steps: case-fold, remove whole-word legal designators matched by
    ``_LEGAL_SUFFIXES``, fold accented letters to their ASCII base letter
    (so "Nestlé" yields "nestle" rather than "nestl"), then delete every
    remaining character matched by ``_NON_ALNUM``.

    Returns:
        The slug, or None when nothing is left after normalization.
    """
    # Function-scope import keeps this helper self-contained.
    import unicodedata

    lowered = name.casefold()
    # Suffixes are removed before ASCII folding so word boundaries are judged
    # on the original letters.
    without_suffixes = _LEGAL_SUFFIXES.sub("", lowered)
    # NFKD splits an accented letter into base letter + combining mark; the
    # ASCII round-trip then discards the marks and any other non-ASCII text.
    decomposed = unicodedata.normalize("NFKD", without_suffixes)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
    slug = _NON_ALNUM.sub("", ascii_only)
    return slug or None
|
|
|
|
|
|
def _verify(client: httpx.Client, url: str) -> str | None:
    """Probe *url* and return the response's URL, or None if it does not check out.

    A HEAD request is sent through ``request_with_retries`` with
    ``max_retries=1``. If the status is in ``_HEAD_NOT_SUPPORTED`` the probe
    is repeated with GET. Any status below 400 counts as success and
    ``str(resp.url)`` is returned.

    This is a best-effort check: any exception raised while probing is
    logged at debug level and reported as None, never propagated.
    """
    try:
        resp = request_with_retries(client, "HEAD", url, max_retries=1)
        if resp.status_code in _HEAD_NOT_SUPPORTED:
            resp = request_with_retries(client, "GET", url, max_retries=1)
        if resp.status_code < 400:
            return str(resp.url)
        logger.debug("verify(%s): rejected, status=%s", url, resp.status_code)
        return None
    except Exception as exc:
        # Deliberately broad: an unreachable guess is an expected outcome,
        # but the reason is recorded so failures can be diagnosed.
        logger.debug("verify(%s): probe failed: %r", url, exc)
        return None
|
|
|
|
|
|
def _search_api_lookup(company_name: str, client: httpx.Client) -> str | None:
    """Tier 3 fallback: look the company's website up through a search API.

    Not implemented yet — this placeholder always returns None.

    Intended implementation: query a search API for
    f'"{company_name}" official website' using settings.search_api_key,
    take the registrable domain from the top organic result's URL, confirm
    it with _verify(), and return that URL (or None when nothing verifies).

    Only reached when SEARCH_API_ENABLED=true and a real SEARCH_API_KEY is
    configured.
    """
    return None
|