# Source code for routinepy.lib.scraper.parsers.html.class_routine
from bs4 import BeautifulSoup
from routinepy.lib.scraper.models import RawClassRoutineTable
# [docs]
class ClassRoutineParser:
    """
    Parser for extracting class routine information from university HTML pages.

    For every metadata table matched by :attr:`TABLE_META_SELECTOR`, the parser
    collects three related tables from the HTML structure:

    1. Metadata table (containing routine header information)
    2. Main routine table (containing schedule data)
    3. Faculty table (containing instructor information)
    """

    TABLE_META_SELECTOR = "table#HdtableRtn"
    """CSS selector for the metadata table (contains semester/program info)"""

    def __init__(self, html: str):
        """
        Initialize the :class:`ClassRoutineParser` with HTML content to be parsed.

        :param html: Raw HTML string containing the routine page content.
        :type html: str
        :raises ValueError: If the provided HTML is ``None``, empty, or
            contains only whitespace.
        """
        # ``not html`` catches None and "" before ``.strip()`` is called, so
        # a missing document fails with the documented ValueError instead of
        # an AttributeError.
        if not html or not html.strip():
            raise ValueError("HTML content cannot be empty")

        self.soup = BeautifulSoup(html, "lxml")

    def extract_routine_tables(self) -> list[RawClassRoutineTable]:
        """
        Extract and package all routine table parts from the HTML.

        For each table matching :attr:`TABLE_META_SELECTOR`, the next
        ``<table>`` after it is taken as the main routine table, and the next
        ``<table>`` after that one as the faculty table:

        - Metadata table: Contains semester, program and section information.
        - Main routine table: Contains the class schedule.
        - Faculty table: Contains teacher and course details.

        A metadata table that is not followed by any other table is skipped,
        since there is no routine to pair it with. When the routine table is
        present but no faculty table follows it, the entry is still returned
        with ``faculty_table`` set to ``None``.

        :return: A list of :class:`RawClassRoutineTable` objects, each containing
            the metadata, routine, and faculty tables for a single class routine.
            The list is empty when no metadata table is found.
        :rtype: list[RawClassRoutineTable]
        """
        extracted_tables = []

        for meta in self.soup.select(self.TABLE_META_SELECTOR):
            routine = meta.find_next("table")
            if routine is None:
                # Truncated or malformed page: nothing follows the header, so
                # calling ``routine.find_next`` would raise AttributeError.
                continue

            faculty_table = routine.find_next("table")
            extracted_tables.append(
                RawClassRoutineTable(
                    meta=meta, routine=routine, faculty_table=faculty_table
                )
            )

        return extracted_tables