Facility: 040664
Greenhurst Mini Storage
- Facility ID
- 040664
- Name
- Greenhurst Mini Storage
- URL
- http://www.greenhurstministorage.com/
- Address
- N/A
- Platform
- table_layout
- Parser File
- src/parsers/table_parser.py
- Last Scraped
- 2026-03-14 05:00:43.078114
- Created
- 2026-03-06 23:45:35.865957
- Updated
- 2026-03-20 23:23:10.660695
- Parser Status
- ✓ Working
- Status Reason
- Auto-classified via platform detection: table_layout
- Last Healing Attempt
- Not attempted
Parser Source (src/parsers/table_parser.py)
"""Generic fallback parser for table-based storage unit listings."""
from __future__ import annotations
import re
from bs4 import BeautifulSoup, Tag
from src.parsers.base import BaseParser, ParseResult, UnitResult
# Column header patterns used to identify relevant tables and map columns.
# All matching is case-insensitive substring search (re.search); the
# precedence among patterns is decided by the elif order in
# TableParser._map_columns, not here.
_SIZE_PATTERNS = re.compile(r"size|dimension|unit\s*type|width.*length", re.IGNORECASE)
_PRICE_PATTERNS = re.compile(r"price|rate|cost|rent|monthly", re.IGNORECASE)
# "web" is a promo keyword because promo columns are commonly labelled
# "Web Special" / "Web Rate" on storage sites — TODO confirm against snapshots.
_PROMO_PATTERNS = re.compile(r"promo|special|discount|sale|web", re.IGNORECASE)
_AVAIL_PATTERNS = re.compile(r"avail|status|vacancy", re.IGNORECASE)
_FEATURE_PATTERNS = re.compile(r"feature|amenity|type|detail|description", re.IGNORECASE)
class TableParser(BaseParser):
    """Extract storage units from HTML tables.

    This is the catch-all fallback parser. It scans the page for ``<table>``
    elements whose headers suggest they contain unit listings (size/price columns),
    then maps rows to :class:`UnitResult` objects.
    """

    platform = "table_layout"

    def parse(self, html: str, url: str = "") -> ParseResult:
        """Scan every ``<table>`` in *html* for unit listings.

        Args:
            html: Raw page HTML.
            url: Page URL; accepted for the parser interface but unused here.

        Returns:
            A :class:`ParseResult` whose ``units`` holds one ``UnitResult``
            per data row of each table that looks like a unit listing.
            Warnings are appended when no tables, or no listing-like tables,
            are found.
        """
        soup = BeautifulSoup(html, "lxml")
        result = ParseResult(platform=self.platform, parser_name=self.__class__.__name__)
        tables = soup.find_all("table")
        if not tables:
            result.warnings.append("No HTML tables found on page")
            return result
        for table in tables:
            headers = self._extract_headers(table)
            if not headers:
                continue
            col_map = self._map_columns(headers)
            # BUGFIX: the previous truthiness test (`not col_map.get("size")`)
            # treated a column index of 0 as "missing", so a table whose first
            # column was Size (with no matched price column) was wrongly
            # rejected. Use membership tests instead.
            if "size" not in col_map and "price" not in col_map:
                # This table does not look like a unit listing.
                continue
            rows = table.select("tbody tr") or table.select("tr")[1:]  # skip header row
            for row in rows:
                cells = row.find_all(["td", "th"])
                if not cells:
                    continue
                # BUGFIX: when the header row sits inside <tbody> (no <thead>),
                # "tbody tr" re-selects it and _row_to_unit (which accepts <th>
                # cells) would emit it as a bogus unit. Skip any row whose cell
                # texts exactly match the extracted headers.
                if [c.get_text(strip=True) for c in cells] == headers:
                    continue
                unit = self._row_to_unit(cells, col_map)
                if unit is not None:
                    result.units.append(unit)
        if not result.units:
            result.warnings.append("Tables found but none appeared to contain unit listings")
        return result

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _extract_headers(table: Tag) -> list[str]:
        """Get header text from the first ``<thead>`` row or the first ``<tr>``.

        BUGFIX: previously a header row built from ``<td>`` cells (no
        ``<thead>``, no ``<th>``) yielded no headers, so the whole table was
        skipped. Now the first row's ``<td>`` cells are used as a fallback;
        a genuine data row used this way still fails the size/price column
        check in :meth:`parse`, so no false listings are introduced.
        """
        thead = table.find("thead")
        if thead:
            return [th.get_text(strip=True) for th in thead.find_all(["th", "td"])]
        first_row = table.find("tr")
        if first_row:
            cells = first_row.find_all("th") or first_row.find_all("td")
            return [c.get_text(strip=True) for c in cells]
        return []

    @staticmethod
    def _map_columns(headers: list[str]) -> dict[str, int]:
        """Map semantic column names to header indices.

        The elif order matters: promo is tested before price so that a
        header like "Special Rate" (which matches both patterns) is
        classified as the promo column rather than stealing the price slot.
        The first header matching each pattern wins.
        """
        col_map: dict[str, int] = {}
        for idx, header in enumerate(headers):
            if _SIZE_PATTERNS.search(header) and "size" not in col_map:
                col_map["size"] = idx
            elif _PROMO_PATTERNS.search(header) and "promo" not in col_map:
                col_map["promo"] = idx
            elif _PRICE_PATTERNS.search(header) and "price" not in col_map:
                col_map["price"] = idx
            elif _AVAIL_PATTERNS.search(header) and "avail" not in col_map:
                col_map["avail"] = idx
            elif _FEATURE_PATTERNS.search(header) and "features" not in col_map:
                col_map["features"] = idx
        return col_map

    def _row_to_unit(self, cells: list[Tag], col_map: dict[str, int]) -> UnitResult | None:
        """Convert a table row into a UnitResult using the column mapping.

        Returns ``None`` for rows whose cells are all empty. Every column
        access is bounds-checked because real-world rows often have fewer
        cells than the header row (colspans, footer rows, etc.).
        """
        texts = [c.get_text(strip=True) for c in cells]
        if all(t == "" for t in texts):
            return None
        unit = UnitResult()
        unit.description = " | ".join(texts)
        # Size
        size_idx = col_map.get("size")
        if size_idx is not None and size_idx < len(texts):
            size_text = texts[size_idx]
            w, ln, sq = self.normalize_size(size_text)
            if w is not None:
                meta = unit.metadata or {}
                meta["width"] = w
                meta["length"] = ln
                meta["sqft"] = sq
                unit.metadata = meta
            unit.size = size_text
        # Price (street rate)
        price_idx = col_map.get("price")
        if price_idx is not None and price_idx < len(texts):
            unit.price = self.normalize_price(texts[price_idx])
        # Promo — store as raw text in promotion field
        promo_idx = col_map.get("promo")
        if promo_idx is not None and promo_idx < len(texts):
            promo_text = texts[promo_idx].strip()
            if promo_text:
                unit.promotion = promo_text
        # Availability
        avail_idx = col_map.get("avail")
        if avail_idx is not None and avail_idx < len(texts):
            unit.scarcity = texts[avail_idx]
        # Features / amenities
        features_idx = col_map.get("features")
        feature_text = ""
        if features_idx is not None and features_idx < len(texts):
            feature_text = texts[features_idx].lower()
        # Also check full row text for amenity keywords
        full_text = " ".join(texts).lower()
        combined = f"{feature_text} {full_text}"
        meta = unit.metadata or {}
        if any(kw in combined for kw in ["climate", "temperature", "heated", "cooled"]):
            meta["climateControlled"] = True
        if any(kw in combined for kw in ["drive-up", "drive up", "driveup"]):
            meta["driveUpAccess"] = True
        if "elevator" in combined:
            meta["elevatorAccess"] = True
        if any(kw in combined for kw in ["ground floor", "ground-floor", "1st floor", "first floor"]):
            meta["groundFloor"] = True
        if any(kw in combined for kw in ["indoor", "interior"]):
            meta["indoor"] = True
        if meta:
            unit.metadata = meta
        return unit
Scrape Runs (2)
Run #89 Details
- Status
- exported
- Parser Used
- TableParser
- Platform Detected
- table_layout
- Units Found
- 0
- Stage Reached
- exported
- Timestamp
- 2026-03-14 01:02:16.791170
Timing
| Stage | Duration |
|---|---|
| Fetch | 2495ms |
| Detect | 1ms |
| Parse | 0ms |
| Export | 11ms |
Snapshot: 040664_20260314T010219Z.html · Show Snapshot · Open in New Tab
No units found in this run.
All Failures for this Facility (2)
parse
_WarningAsException
scraper
no_units_extracted
warning
Run #N/A | 2026-03-14 05:00:43.074642
No units extracted for 040664
Stack trace
src.reporting.failure_reporter._WarningAsException: No units extracted for 040664
parse
_WarningAsException
scraper
no_units_extracted
warning
Run #N/A | 2026-03-14 01:02:19.324509
No units extracted for 040664
Stack trace
src.reporting.failure_reporter._WarningAsException: No units extracted for 040664