Facility: 009208
Huntley Mini Storage & RV
- Facility ID
- 009208
- Name
- Huntley Mini Storage & RV
- URL
- https://www.huntleyproject.net/huntley-mini-storage-rv/
- Address
- N/A
- Platform
- table_layout
- Parser File
- src/parsers/table_parser.py
- Last Scraped
- 2026-03-27 14:00:43.759409
- Created
- 2026-03-06 23:45:35.865957
- Updated
- 2026-03-27 14:00:43.759409
- Parser Status
- ⚠ Needs Fix
- Status Reason
- Parser returned 0 units
- Last Healing Attempt
- Not attempted
Parser Source (src/parsers/table_parser.py)
"""Generic fallback parser for table-based storage unit listings."""
from __future__ import annotations
import re
from bs4 import BeautifulSoup, Tag
from src.parsers.base import BaseParser, ParseResult, UnitResult
# Column header patterns used to identify relevant tables and map columns.
# Every pattern is case-insensitive and is meant to be applied with
# ``.search()``, so it matches anywhere inside a header cell's text.

# Unit dimensions, e.g. "Size", "Dimensions", "Unit Type", "Width x Length".
_SIZE_PATTERNS = re.compile(r"size|dimension|unit\s*type|width.*length", re.IGNORECASE)
# Regular (street) price, e.g. "Price", "Monthly Rate", "Rent".
_PRICE_PATTERNS = re.compile(r"price|rate|cost|rent|monthly", re.IGNORECASE)
# Promotional / discounted pricing, e.g. "Web Rate", "Special".
_PROMO_PATTERNS = re.compile(r"promo|special|discount|sale|web", re.IGNORECASE)
# Availability, e.g. "Availability", "Status", "Vacancy".
_AVAIL_PATTERNS = re.compile(r"avail|status|vacancy", re.IGNORECASE)
# Free-form feature columns. ``amenit(?:y|ies)`` covers both the singular and
# the plural header "Amenities", which a literal "amenity" does not match.
_FEATURE_PATTERNS = re.compile(
    r"feature|amenit(?:y|ies)|type|detail|description", re.IGNORECASE
)
class TableParser(BaseParser):
    """Extract storage units from HTML tables.

    This is the catch-all fallback parser. It scans the page for ``<table>``
    elements whose headers suggest they contain unit listings (size/price
    columns), then maps rows to :class:`UnitResult` objects.
    """

    platform = "table_layout"

    def parse(self, html: str, url: str = "") -> ParseResult:
        """Parse *html* and collect one unit per data row of each listing table.

        Args:
            html: Page markup to scan for ``<table>`` elements.
            url: Not used by this parser.

        Returns:
            A :class:`ParseResult` whose ``units`` holds every row converted by
            :meth:`_row_to_unit`. A warning is appended when the page has no
            tables at all, or when no table produced a unit.
        """
        soup = BeautifulSoup(html, "lxml")
        result = ParseResult(platform=self.platform, parser_name=self.__class__.__name__)

        tables = soup.find_all("table")
        if not tables:
            result.warnings.append("No HTML tables found on page")
            return result

        for table in tables:
            headers = self._extract_headers(table)
            if not headers:
                continue

            col_map = self._map_columns(headers)
            # Membership test rather than truthiness: a column found at
            # index 0 is valid, but ``not col_map.get(...)`` would treat it
            # as missing and skip the table.
            if "size" not in col_map and "price" not in col_map:
                # This table does not look like a unit listing.
                continue

            # Without a <thead>, the first <tr> supplied the headers; remember
            # it so it is not emitted as a unit, even when it sits inside an
            # explicit <tbody>.
            header_row = None if table.find("thead") is not None else table.find("tr")

            rows = table.select("tbody tr") or table.find_all("tr")
            for row in rows:
                # Skip header rows: the one used for headers, and any row
                # that belongs to a <thead>.
                if row is header_row or row.parent.name == "thead":
                    continue
                cells = row.find_all(["td", "th"])
                if not cells:
                    continue
                unit = self._row_to_unit(cells, col_map)
                if unit is not None:
                    result.units.append(unit)

        if not result.units:
            result.warnings.append("Tables found but none appeared to contain unit listings")
        return result

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_headers(table: Tag) -> list[str]:
        """Get header text from the ``<thead>`` cells or from the first ``<tr>``.

        When the table has a ``<thead>``, the text of every ``<th>``/``<td>``
        inside it is returned. Otherwise the cells of the first ``<tr>`` are
        used, whether they are ``<th>`` or ``<td>``, so that positions line up
        with the data rows' cells. Whether that row is really a header is
        decided afterwards by :meth:`_map_columns`.

        Returns:
            Header texts in document order, or an empty list when the table
            has no rows or the chosen row has no cells.
        """
        thead = table.find("thead")
        if thead is not None:
            return [cell.get_text(strip=True) for cell in thead.find_all(["th", "td"])]

        first_row = table.find("tr")
        if first_row is None:
            return []
        return [cell.get_text(strip=True) for cell in first_row.find_all(["th", "td"])]

    @staticmethod
    def _map_columns(headers: list[str]) -> dict[str, int]:
        """Map semantic column names to header indices.

        Each header is assigned to the first still-unassigned category whose
        pattern matches it, tried in the order size, promo, price, avail,
        features. Promo is tried before price, so a header such as "Web Rate"
        is mapped to ``promo`` rather than ``price``.

        Returns:
            A dict with any of the keys ``size``, ``promo``, ``price``,
            ``avail`` and ``features`` mapped to the index of the first
            header that claimed that category.
        """
        matchers = (
            ("size", _SIZE_PATTERNS),
            ("promo", _PROMO_PATTERNS),
            ("price", _PRICE_PATTERNS),
            ("avail", _AVAIL_PATTERNS),
            ("features", _FEATURE_PATTERNS),
        )
        col_map: dict[str, int] = {}
        for idx, header in enumerate(headers):
            for key, pattern in matchers:
                if key not in col_map and pattern.search(header):
                    col_map[key] = idx
                    break  # one category per header
        return col_map

    def _row_to_unit(self, cells: list[Tag], col_map: dict[str, int]) -> UnitResult | None:
        """Convert a table row into a UnitResult using the column mapping.

        Args:
            cells: The row's ``<td>``/``<th>`` elements in document order.
            col_map: Column mapping as produced by :meth:`_map_columns`.

        Returns:
            A populated :class:`UnitResult`, or ``None`` when every cell in
            the row is empty.
        """
        texts = [cell.get_text(strip=True) for cell in cells]
        if not any(texts):
            return None

        def cell_text(key: str) -> str | None:
            """Return the text of the mapped column, or None if unmapped or out of range."""
            idx = col_map.get(key)
            if idx is not None and idx < len(texts):
                return texts[idx]
            return None

        unit = UnitResult()
        unit.description = " | ".join(texts)

        # Size: keep the raw text, and add parsed dimensions when available.
        size_text = cell_text("size")
        if size_text is not None:
            width, length, sqft = self.normalize_size(size_text)
            if width is not None:
                meta = unit.metadata or {}
                meta["width"] = width
                meta["length"] = length
                meta["sqft"] = sqft
                unit.metadata = meta
            unit.size = size_text

        # Price (street rate)
        price_text = cell_text("price")
        if price_text is not None:
            unit.price = self.normalize_price(price_text)

        # Promo — store as raw text in promotion field
        promo_text = (cell_text("promo") or "").strip()
        if promo_text:
            unit.promotion = promo_text

        # Availability
        avail_text = cell_text("avail")
        if avail_text is not None:
            unit.scarcity = avail_text

        # Features / amenities: search the dedicated column plus the whole
        # row text for amenity keywords.
        feature_text = (cell_text("features") or "").lower()
        full_text = " ".join(texts).lower()
        combined = f"{feature_text} {full_text}"

        keyword_flags = (
            ("climateControlled", ("climate", "temperature", "heated", "cooled")),
            ("driveUpAccess", ("drive-up", "drive up", "driveup")),
            ("elevatorAccess", ("elevator",)),
            ("groundFloor", ("ground floor", "ground-floor", "1st floor", "first floor")),
            ("indoor", ("indoor", "interior")),
        )
        meta = unit.metadata or {}
        for flag, keywords in keyword_flags:
            if any(kw in combined for kw in keywords):
                meta[flag] = True
        if meta:
            unit.metadata = meta

        return unit
Scrape Runs (6)
-
exported Run #2035 | 2026-03-27 14:00:40.649520 | TableParser
-
exported Run #2034 | 2026-03-27 14:00:40.377799 | TableParser
-
exported Run #1283 | 2026-03-23 03:02:25.726947 | TableParser
-
exported Run #790 | 2026-03-21 18:54:00.702514 | TableParser
-
exported Run #339 | 2026-03-14 16:34:44.658366 | TableParser
-
failed Run #24 | 2026-03-09 20:49:19.623894 | 1 failure(s)
Run #339 Details
- Status
- exported
- Parser Used
- TableParser
- Platform Detected
- table_layout
- Units Found
- 0
- Stage Reached
- exported
- Timestamp
- 2026-03-14 16:34:44.658366
Timing
| Stage | Duration |
|---|---|
| Fetch | 3265ms |
| Detect | 7ms |
| Parse | 3ms |
| Export | 11ms |
Snapshot: 009208_20260314T163447Z.html · Show Snapshot · Open in New Tab
No units found in this run.
All Failures for this Facility (6)
parse
_WarningAsException
scraper
no_units_extracted
warning
Run #N/A | 2026-03-27 14:00:43.740897
No units extracted for 009208
Stack trace
src.reporting.failure_reporter._WarningAsException: No units extracted for 009208
parse
_WarningAsException
scraper
no_units_extracted
warning
Run #N/A | 2026-03-27 14:00:43.176909
No units extracted for 009208
Stack trace
src.reporting.failure_reporter._WarningAsException: No units extracted for 009208
parse
_WarningAsException
scraper
no_units_extracted
warning
Run #N/A | 2026-03-23 03:02:28.262888
No units extracted for 009208
Stack trace
src.reporting.failure_reporter._WarningAsException: No units extracted for 009208
parse
_WarningAsException
scraper
no_units_extracted
warning
Run #N/A | 2026-03-21 18:54:03.667929
No units extracted for 009208
Stack trace
src.reporting.failure_reporter._WarningAsException: No units extracted for 009208
parse
_WarningAsException
scraper
no_units_extracted
warning
Run #N/A | 2026-03-14 16:34:47.967640
No units extracted for 009208
Stack trace
src.reporting.failure_reporter._WarningAsException: No units extracted for 009208
fetch
DatatypeMismatch
unknown
unknown
permanent
Run #24 | 2026-03-09 20:49:23.022572
column "success" is of type boolean but expression is of type integer LINE 3: ... VALUES ('009208', 24, '009208_20260309T204923Z.html', 0) ^ HINT: You will need to rewrite or cast the expression.
Stack trace
Traceback (most recent call last):
File "/app/src/pipeline.py", line 329, in _process_facility
manifest_id = storage.insert_snapshot_manifest(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/src/db/pg_backend.py", line 615, in insert_snapshot_manifest
row = self._execute_returning(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/src/db/pg_backend.py", line 54, in _execute_returning
cur.execute(sql, params)
File "/app/.venv/lib/python3.11/site-packages/psycopg2/extras.py", line 236, in execute
return super().execute(query, vars)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
psycopg2.errors.DatatypeMismatch: column "success" is of type boolean but expression is of type integer
LINE 3: ... VALUES ('009208', 24, '009208_20260309T204923Z.html', 0)
^
HINT: You will need to rewrite or cast the expression.