# Parser source: src/parsers/storageunitsoftware.py
"""Parser for StorEdge / StorageUnitSoftware facility pages."""
from __future__ import annotations
import re
from bs4 import BeautifulSoup, Tag
from src.parsers.base import BaseParser, ParseResult, UnitResult
class StorageUnitSoftwareParser(BaseParser):
    """Extract storage units from StorEdge/StorageUnitSoftware HTML pages.

    Supports two major layout variants:

    1. **Legacy/SPA layout** — unit rows with ``.sus-`` or ``.unit-`` class
       prefixes, or ``data-unit-id`` attributes.
    2. **Bootstrap card layout** — ``div.card.rounded-0`` containers
       with ``h4.primary-color`` for the size name and
       ``strong.price.primary-color`` for pricing. This is the more common
       layout seen on modern StorageUnitSoftware white-label sites.

    Price and size strings are converted with ``self.normalize_price`` and
    ``self.normalize_size``, which are not defined in this class
    (presumably inherited from ``BaseParser`` — confirm there). This class
    treats a ``None`` result from either as "could not parse".
    """

    platform = "storageunitsoftware"

    # Billing-period suffix such as "/ month", "/month*" or "/ Month".
    _MONTH_SUFFIX_RE = re.compile(r"/\s*month\*?", re.IGNORECASE)
    # First dollar amount in free text, e.g. "Starting at $1,234.50".
    _DOLLAR_AMOUNT_RE = re.compile(r"\$([\d,]+(?:\.\d+)?)")
    # Negated climate wording ("non-climate", "no climate", "not heated",
    # "unheated", ...) that must not count as a climate-control amenity.
    _NEGATED_CLIMATE_RE = re.compile(
        r"\b(?:non|not|no|un)[\s\-]*(?:climate|temperature|heated|cooled)"
    )
    _CLIMATE_KEYWORDS = ["climate", "temperature", "heated", "cooled"]
    # (metadata key, keywords) pairs, applied in this order after climate.
    _AMENITY_KEYWORDS = (
        ("driveUpAccess", ["drive-up", "drive up", "driveup", "drive_up"]),
        ("elevatorAccess", ["elevator"]),
        ("groundFloor", ["ground floor", "ground-floor", "1st floor", "first floor"]),
        ("indoor", ["indoor", "interior"]),
    )

    def parse(self, html: str, url: str = "") -> ParseResult:
        """Parse a facility page and return the units found on it.

        Args:
            html: Raw HTML of the facility page.
            url: Accepted for interface compatibility; not used here.

        Returns:
            A ``ParseResult`` whose ``units`` holds one entry per matched
            unit element. When no selector strategy matches, ``units`` is
            left untouched and a warning is appended to ``warnings``.
        """
        soup = BeautifulSoup(html, "lxml")
        result = ParseResult(platform=self.platform, parser_name=self.__class__.__name__)

        unit_elements = self._find_unit_elements(soup)
        if not unit_elements:
            result.warnings.append("No unit elements found with StorageUnitSoftware selectors")
            return result

        for el in unit_elements:
            unit = self._parse_unit_element(el)
            if unit is not None:
                result.units.append(unit)
        return result

    def _find_unit_elements(self, soup: BeautifulSoup) -> list:
        """Return unit container elements using the first strategy that matches."""
        # Strategy 1: sus-unit-row / sus-unit-card containers
        unit_elements = soup.select(".sus-unit-row, .sus-unit-card, .sus-unit-item")
        if unit_elements:
            return unit_elements

        # Strategy 2: unit-list direct children
        unit_elements = soup.select(".unit-list .unit-item, .unit-list li, .unit-list tr")
        if unit_elements:
            return unit_elements

        # Strategy 3: storedge-specific containers
        unit_elements = soup.select("[class*='storedge-unit'], [class*='se-unit']")
        if unit_elements:
            return unit_elements

        # Strategy 4: Bootstrap card layout (modern SUS white-label sites)
        # Two variants exist:
        #   - div.card.rounded-0.mb-2 (full-width card layout)
        #   - div.card.rounded-0.p-3 (grid card layout)
        # Both contain h4.primary-color for size and strong.price for pricing.
        # Checked before data-attribute fallback to avoid matching site-map elements.
        # Only cards that contain a price element are kept (avoids nav/footer cards).
        unit_elements = [
            card
            for card in soup.select("div.card.rounded-0")
            if card.select_one("strong.price, .price.primary-color")
        ]
        if unit_elements:
            return unit_elements

        # Strategy 5: generic unit rows with data attributes (broadest fallback)
        return soup.select("[data-unit-id], [data-unit]")

    def _parse_unit_element(self, el: Tag) -> UnitResult | None:
        """Extract a single unit from an element.

        Note: ``.d-none`` descendants of the visible part of ``el`` are
        removed from the tree (in place) before anything is extracted.
        """
        unit = UnitResult()

        # Use only the visible card-body text, excluding hidden modal content
        card_body = el.select_one(".card-body")
        visible_el = card_body if card_body is not None else el
        for hidden in visible_el.select(".d-none"):
            hidden.decompose()
        unit.description = visible_el.get_text(separator=" ", strip=True)

        # --- Size / Dimensions ---
        self._extract_size(el, unit)

        # --- Pricing ---
        self._extract_pricing(el, unit)

        # --- Amenities ---
        text_lower = (unit.description or "").lower()
        raw_classes = el.get("class") or []
        if isinstance(raw_classes, str):
            # Some parser configurations expose ``class`` as a plain string.
            raw_classes = raw_classes.split()
        classes_str = " ".join(raw_classes)

        meta = unit.metadata or {}

        # Strip negated phrases first so "Non-Climate Controlled" or
        # "unheated" do not flag the unit as climate controlled.
        climate_text = self._NEGATED_CLIMATE_RE.sub(" ", text_lower)
        climate_classes = self._NEGATED_CLIMATE_RE.sub(" ", classes_str.lower())
        if _has_any(climate_text, climate_classes, self._CLIMATE_KEYWORDS):
            meta["climateControlled"] = True

        for meta_key, keywords in self._AMENITY_KEYWORDS:
            if _has_any(text_lower, classes_str, keywords):
                meta[meta_key] = True

        if meta:
            unit.metadata = meta

        # --- Availability ---
        self._extract_availability(el, unit)
        return unit

    def _apply_size(self, unit: UnitResult, size_text: str) -> None:
        """Store ``size_text`` on ``unit`` plus parsed dimensions when available."""
        w, ln, sq = self.normalize_size(size_text)
        if w is not None:
            meta = unit.metadata or {}
            meta["width"] = w
            meta["length"] = ln
            meta["sqft"] = sq
            unit.metadata = meta
        # The raw text is kept even when the dimensions could not be parsed.
        unit.size = size_text

    def _extract_size(self, el: Tag, unit: UnitResult) -> None:
        """Extract size/dimensions from the unit element."""
        # Bootstrap card layout: h4.primary-color contains "Name (W x L)"
        heading = el.select_one("h4.primary-color")
        if heading is not None:
            self._apply_size(unit, heading.get_text(strip=True))
            return

        # Legacy layout: elements with size/dimension class names
        size_el = (
            el.select_one("[class*='size']")
            or el.select_one("[class*='dimension']")
            or el.select_one(".sus-size")
        )
        if size_el is not None:
            self._apply_size(unit, size_el.get_text(strip=True))

    def _parse_price_text(self, text: str):
        """Convert price text such as ``"Starting at $45 / month*"`` to a price.

        The "/ month" suffix is removed, then ``self.normalize_price`` is
        tried on the remaining text. If that yields ``None``, the first
        ``$``-prefixed amount in the text is extracted and normalized
        instead. Returns whatever ``self.normalize_price`` returns, or
        ``None`` when nothing could be parsed.
        """
        cleaned = self._MONTH_SUFFIX_RE.sub("", text).strip()
        price = self.normalize_price(cleaned)
        if price is None:
            amount = self._DOLLAR_AMOUNT_RE.search(cleaned)
            if amount:
                price = self.normalize_price(amount.group(1))
        return price

    def _extract_pricing(self, el: Tag, unit: UnitResult) -> None:
        """Extract pricing from the unit element.

        Sets ``unit.price`` (street rate), ``unit.sale_price`` (web or
        discounted rate) and ``unit.promotion`` where the markup provides
        them.
        """
        # Bootstrap card layout: strong.price.primary-color contains "$XX / month"
        # Some sites show a strikethrough original price:
        #   <s>$40</s> <span>$30.31 / month</span>
        # The <s> value is the street rate; the rest is the discounted rate.
        price_strong = el.select_one("strong.price.primary-color") or el.select_one("strong.price")
        if price_strong is not None:
            struck = price_strong.select_one("s")
            if struck is not None:
                unit.price = self._parse_price_text(struck.get_text(strip=True))

                price_span = price_strong.select_one("span")
                if price_span is not None:
                    unit.sale_price = self._parse_price_text(price_span.get_text(strip=True))
                else:
                    # No <span>: the discounted price may be a bare text node
                    # next to the <s>, so use the text outside the <s> tag.
                    outside_text = " ".join(
                        piece
                        for piece in price_strong.strings
                        if not any(parent is struck for parent in piece.parents)
                    ).strip()
                    if outside_text:
                        unit.sale_price = self._parse_price_text(outside_text)

                if unit.sale_price is not None or unit.price is not None:
                    return
            else:
                unit.sale_price = self._parse_price_text(price_strong.get_text(strip=True))
                if unit.sale_price is not None:
                    return

        # Legacy layout: elements with rate/price class names
        street_el = el.select_one("[class*='street-rate']") or el.select_one("[class*='regular']")
        if street_el is not None:
            unit.price = self.normalize_price(street_el.get_text(strip=True))

        web_el = el.select_one("[class*='web-rate']") or el.select_one("[class*='online']")
        if web_el is not None:
            unit.sale_price = self.normalize_price(web_el.get_text(strip=True))

        promo_el = el.select_one("[class*='promo']") or el.select_one("[class*='special']")
        if promo_el is not None:
            promo_text = promo_el.get_text(strip=True)
            unit.promotion = promo_text if promo_text else None

        # Fallback: generic price element
        if unit.price is None and unit.sale_price is None:
            price_el = el.select_one("[class*='price']") or el.select_one("[class*='rate']")
            if price_el is not None:
                unit.sale_price = self.normalize_price(price_el.get_text(strip=True))

    def _extract_availability(self, el: Tag, unit: UnitResult) -> None:
        """Extract availability status from the unit element into ``unit.scarcity``."""
        text = el.get_text(separator=" ", strip=True).lower()

        # Bootstrap card layout: button text or link text indicates status
        if "rent now" in text:
            unit.scarcity = "Available"
        elif "waiting list" in text or "waitlist" in text:
            unit.scarcity = "Waitlist"
        elif "sold out" in text or "no units" in text:
            unit.scarcity = "Unavailable"

        # Legacy layout: element with avail class
        if unit.scarcity is None:
            avail_el = el.select_one("[class*='avail']")
            if avail_el is not None:
                unit.scarcity = avail_el.get_text(strip=True)
def _has_any(text: str, classes: str, keywords: list[str]) -> bool:
"""Check if any keyword appears in the text or CSS classes."""
combined = f"{text} {classes}".lower()
return any(kw in combined for kw in keywords)