Facility: 028688

U-Haul Hardin MT

Facility Information disabled
Facility ID
028688
Name
U-Haul Hardin MT
URL
https://www.uhaul.com/Locations/Self-Storage-near-Hardin-MT-59034/1021921/
Address
N/A
Platform
table_layout
Parser File
src/parsers/table_parser.py
Last Scraped
Never
Created
2026-03-06 23:45:35.865957
Updated
2026-03-20 23:23:10.660695
Parser & Healing Diagnosis working
Parser Status
✓ Working
Status Reason
Auto-classified via platform detection: table_layout
Last Healing Attempt
Not attempted
Parser Source (src/parsers/table_parser.py)
"""Generic fallback parser for table-based storage unit listings."""

from __future__ import annotations

import re

from bs4 import BeautifulSoup, Tag

from src.parsers.base import BaseParser, ParseResult, UnitResult

# Column header patterns used to identify relevant tables and map columns.
_SIZE_PATTERNS = re.compile(r"size|dimension|unit\s*type|width.*length", re.IGNORECASE)
_PRICE_PATTERNS = re.compile(r"price|rate|cost|rent|monthly", re.IGNORECASE)
_PROMO_PATTERNS = re.compile(r"promo|special|discount|sale|web", re.IGNORECASE)
_AVAIL_PATTERNS = re.compile(r"avail|status|vacancy", re.IGNORECASE)
_FEATURE_PATTERNS = re.compile(r"feature|amenity|type|detail|description", re.IGNORECASE)


class TableParser(BaseParser):
    """Extract storage units from HTML tables.

    This is the catch-all fallback parser. It scans the page for ``<table>``
    elements whose headers suggest they contain unit listings (a size and/or
    price column), then maps each data row to a :class:`UnitResult`.
    """

    platform = "table_layout"

    def parse(self, html: str, url: str = "") -> ParseResult:
        """Parse *html* and collect one unit per data row of each listing table.

        Args:
            html: Page markup to scan for tables.
            url: Not used by this parser.

        Returns:
            A :class:`ParseResult` whose ``units`` holds the extracted rows. A
            warning is appended when the page contains no ``<table>`` at all,
            or when no table yielded a unit.
        """
        soup = BeautifulSoup(html, "lxml")
        result = ParseResult(platform=self.platform, parser_name=self.__class__.__name__)

        tables = soup.find_all("table")
        if not tables:
            result.warnings.append("No HTML tables found on page")
            return result

        for table in tables:
            headers = self._extract_headers(table)
            if not headers:
                continue

            col_map = self._map_columns(headers)
            # Test key presence, not truthiness: column index 0 is a valid
            # mapping and must not be mistaken for "column missing".
            if "size" not in col_map and "price" not in col_map:
                # This table does not look like a unit listing.
                continue

            # Without a <thead>, the headers were read from the first <tr>.
            # That row may sit inside <tbody>, so remember it and skip it below
            # instead of emitting the header labels as a unit.
            header_row = table.find("tr") if table.find("thead") is None else None

            rows = table.select("tbody tr") or table.select("tr")[1:]  # skip header row
            for row in rows:
                # Header rows (the remembered one, or any extra <thead> row
                # reached through the fallback selector) carry no unit data.
                if row is header_row or row.find_parent("thead") is not None:
                    continue
                cells = row.find_all(["td", "th"])
                if not cells:
                    continue
                unit = self._row_to_unit(cells, col_map)
                if unit is not None:
                    result.units.append(unit)

        if not result.units:
            result.warnings.append("Tables found but none appeared to contain unit listings")

        return result

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_headers(table: Tag) -> list[str]:
        """Return the header cell texts for *table*.

        With a ``<thead>``, the texts of a single header row are returned so
        that list positions line up with body-cell positions: the first row
        that maps to a size or price column, or the first row if none does.
        Without a ``<thead>``, the ``<th>`` cells of the table's first ``<tr>``
        are used. Returns an empty list when no header cells are found.
        """
        thead = table.find("thead")
        if thead is not None:
            head_rows = thead.find_all("tr")
            if not head_rows:
                # Cells placed directly inside <thead> with no <tr> wrapper.
                return [cell.get_text(strip=True) for cell in thead.find_all(["th", "td"])]

            candidates = [
                [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]
                for row in head_rows
            ]
            # A multi-row header often starts with a title/grouping row; prefer
            # the row that actually names the columns we care about.
            for texts in candidates:
                mapped = TableParser._map_columns(texts)
                if "size" in mapped or "price" in mapped:
                    return texts
            return candidates[0]

        first_row = table.find("tr")
        if first_row is not None:
            ths = first_row.find_all("th")
            if ths:
                return [th.get_text(strip=True) for th in ths]

        return []

    @staticmethod
    def _map_columns(headers: list[str]) -> dict[str, int]:
        """Map semantic column names to header indices.

        Each header is assigned to at most one of ``size``, ``promo``,
        ``price``, ``avail`` or ``features``; the first matching header wins
        for each name. Promo is tested before price, so a header that matches
        both patterns (e.g. "Web Rate") is recorded as the promo column.
        """
        col_map: dict[str, int] = {}
        for idx, header in enumerate(headers):
            if _SIZE_PATTERNS.search(header) and "size" not in col_map:
                col_map["size"] = idx
            elif _PROMO_PATTERNS.search(header) and "promo" not in col_map:
                col_map["promo"] = idx
            elif _PRICE_PATTERNS.search(header) and "price" not in col_map:
                col_map["price"] = idx
            elif _AVAIL_PATTERNS.search(header) and "avail" not in col_map:
                col_map["avail"] = idx
            elif _FEATURE_PATTERNS.search(header) and "features" not in col_map:
                col_map["features"] = idx
        return col_map

    def _row_to_unit(self, cells: list[Tag], col_map: dict[str, int]) -> UnitResult | None:
        """Convert a table row into a UnitResult using the column mapping.

        Args:
            cells: The row's ``<td>``/``<th>`` elements in document order.
            col_map: Semantic name -> cell index, as built by ``_map_columns``.

        Returns:
            The populated unit, or ``None`` when every cell is empty. Mapped
            indices beyond the end of a short row are ignored.
        """
        texts = [c.get_text(strip=True) for c in cells]
        if all(t == "" for t in texts):
            return None

        unit = UnitResult()
        unit.description = " | ".join(texts)
        meta = unit.metadata or {}

        # Size
        size_idx = col_map.get("size")
        if size_idx is not None and size_idx < len(texts):
            size_text = texts[size_idx]
            # normalize_size is not defined in this class (presumably provided
            # by BaseParser); it is unpacked as (width, length, sqft) and the
            # dimensions are recorded only when a width was obtained.
            w, ln, sq = self.normalize_size(size_text)
            if w is not None:
                meta["width"] = w
                meta["length"] = ln
                meta["sqft"] = sq
            unit.size = size_text

        # Price (street rate)
        price_idx = col_map.get("price")
        if price_idx is not None and price_idx < len(texts):
            unit.price = self.normalize_price(texts[price_idx])

        # Promo — store as raw text in promotion field
        promo_idx = col_map.get("promo")
        if promo_idx is not None and promo_idx < len(texts):
            promo_text = texts[promo_idx].strip()
            if promo_text:
                unit.promotion = promo_text

        # Availability
        avail_idx = col_map.get("avail")
        if avail_idx is not None and avail_idx < len(texts):
            unit.scarcity = texts[avail_idx]

        # Features / amenities
        features_idx = col_map.get("features")
        feature_text = ""
        if features_idx is not None and features_idx < len(texts):
            feature_text = texts[features_idx].lower()

        # Also check full row text for amenity keywords
        full_text = " ".join(texts).lower()
        combined = f"{feature_text} {full_text}"

        if any(kw in combined for kw in ["climate", "temperature", "heated", "cooled"]):
            meta["climateControlled"] = True
        if any(kw in combined for kw in ["drive-up", "drive up", "driveup"]):
            meta["driveUpAccess"] = True
        if "elevator" in combined:
            meta["elevatorAccess"] = True
        if any(kw in combined for kw in ["ground floor", "ground-floor", "1st floor", "first floor"]):
            meta["groundFloor"] = True
        if any(kw in combined for kw in ["indoor", "interior"]):
            meta["indoor"] = True

        # Leave unit.metadata at its default when nothing was detected.
        if meta:
            unit.metadata = meta

        return unit

Scrape Runs (1)

Run #28 Details

Status
failed
Parser Used
N/A
Platform Detected
N/A
Units Found
0
Stage Reached
fetch
Timestamp
2026-03-09 20:49:31.734085

Failures (1)

fetch DatatypeMismatch unknown unknown permanent

column "success" is of type boolean but expression is of type integer LINE 3: ... VALUES ('028688', 28, '028688_20260309T204935Z.html', 0) ^ HINT: You will need to rewrite or cast the expression.

Stack trace
Traceback (most recent call last):
  File "/app/src/pipeline.py", line 329, in _process_facility
    manifest_id = storage.insert_snapshot_manifest(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/app/src/db/pg_backend.py", line 615, in insert_snapshot_manifest
    row = self._execute_returning(
          ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/app/src/db/pg_backend.py", line 54, in _execute_returning
    cur.execute(sql, params)
  File "/app/.venv/lib/python3.11/site-packages/psycopg2/extras.py", line 236, in execute
    return super().execute(query, vars)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
psycopg2.errors.DatatypeMismatch: column "success" is of type boolean but expression is of type integer
LINE 3: ...    VALUES ('028688', 28, '028688_20260309T204935Z.html', 0)
                                                                     ^
HINT:  You will need to rewrite or cast the expression.

All Failures for this Facility (1)

fetch DatatypeMismatch unknown unknown permanent Run #28 | 2026-03-09 20:49:35.899947

column "success" is of type boolean but expression is of type integer LINE 3: ... VALUES ('028688', 28, '028688_20260309T204935Z.html', 0) ^ HINT: You will need to rewrite or cast the expression.

Stack trace
Traceback (most recent call last):
  File "/app/src/pipeline.py", line 329, in _process_facility
    manifest_id = storage.insert_snapshot_manifest(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/app/src/db/pg_backend.py", line 615, in insert_snapshot_manifest
    row = self._execute_returning(
          ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/app/src/db/pg_backend.py", line 54, in _execute_returning
    cur.execute(sql, params)
  File "/app/.venv/lib/python3.11/site-packages/psycopg2/extras.py", line 236, in execute
    return super().execute(query, vars)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
psycopg2.errors.DatatypeMismatch: column "success" is of type boolean but expression is of type integer
LINE 3: ...    VALUES ('028688', 28, '028688_20260309T204935Z.html', 0)
                                                                     ^
HINT:  You will need to rewrite or cast the expression.

← Back to dashboard