Source code for routinepy.lib.scraper.client

from loguru import logger

from routinepy.consts import ROUTINE_PAGE_URL
from routinepy.lib.api.enums import ProgramCode
from routinepy.lib.models import ClassPeriod, TermExam
from routinepy.lib.scraper.parsers.html import ClassRoutineParser, RoutinePageParser
from routinepy.lib.scraper.parsers.pdf import BaseExamPdfParser
from routinepy.lib.scraper.scraper_utils import (
    filter_periods,
    get_shift_name,
    reverse_program_code,
)
from routinepy.lib.scraper.transformers import (
    BaseExamTableTransformer,
    ClassRoutineTableTransformer,
)
from routinepy.lib.utils.http import make_get_request



[docs]
class ScraperClient:
    """
    Client for fetching routine data by scraping the BUBT routines webpage.

    Exposes coroutines that retrieve the routine links, class routines and
    exam routines.
    """


[docs]
    @logger.catch()
    async def get_routine_links(self):
        """
        Collect the class, exam and supplementary exam routine links published on the
        `university's webpage <https://bubt.edu.bd/home/routines>`_

        The page at ``ROUTINE_PAGE_URL`` is downloaded and handed to
        ``RoutinePageParser``; every category is produced by one parser call.

        :raises ValueError: Failed to get routine webpage (the response was empty)
        :return: A dictionary with three routine link categories:

            - ``class``: A dictionary whose ``links`` key holds a list of dictionaries,
                each carrying ``program_code`` (str) and ``link`` (str) of a class routine.

            - ``exam``: A dictionary whose ``links`` key holds a list of dictionaries,
                each mapping shift names to program-specific exam routine links.

            - ``sup_exam``: A dictionary whose ``links`` key holds a list of supplementary exam routine links.
        :rtype: dict
        """
        page_html = await make_get_request(ROUTINE_PAGE_URL)
        if not page_html:
            raise ValueError("Failed to get routine webpage")

        page = RoutinePageParser(page_html)

        # Dict displays evaluate their values in order, so the parser is
        # queried for class, exam and supplementary exam links in that sequence.
        return {
            "class": page.get_class_routine_links(),
            "exam": page.get_exam_routine_links(),
            "sup_exam": page.get_sup_exam_routine_links(),
        }



[docs]
    @logger.catch()
    async def get_class_routine(
        self,
        program_code: ProgramCode,
        course_code: str | None = None,
        faculty_code: str | None = None,
        intake: str | None = None,
        section: str | None = None,
    ) -> list[ClassPeriod] | None:
        """
        Get class routine data based on provided filters.

        The routine HTML of the program is downloaded, its tables are extracted
        and transformed into models, and the result is narrowed down with
        ``filter_periods``.

        :param program_code: The program code to filter by (e.g., `006`, `001`)
        :type program_code: ProgramCode
        :param course_code: The course code to filter by (e.g., `CSE 101`, `CSE 331`), defaults to None
        :type course_code: str, optional
        :param faculty_code: The faculty member code to filter by (e.g., `MDI`, `MAFI` ), defaults to None
        :type faculty_code: str, optional
        :param intake: The intake number to filter by (e.g., `49`, `50`), defaults to None
        :type intake: str, optional
        :param section: The section number to filter by (e.g., `1`, `2`), defaults to None
        :type section: str, optional
        :raises ValueError: If ``section`` is given without ``intake``, or the routine HTML
            could not be downloaded
        :return: A list of filtered ClassPeriod objects representing the class routine, or
            ``None`` when the response is empty or no routine tables could be produced
        :rtype: list[ClassPeriod] | None

        .. note::
            - Program codes follow the university's standard numbering system
            - Filtering by room number is not implemented as we need to download the routines of all programs.
            - The method is wrapped in ``logger.catch()``; with loguru's default settings
              a raised exception is logged and ``None`` is returned to the caller.
        """
        if section and not intake:
            raise ValueError("Intake is required when filtering by section")

        try:
            html = await self.get_class_routine_html(program_code)
        except Exception as e:
            msg = f"Failed to download routine HTML for program code {program_code}"
            logger.error(f"{msg}: {e}")
            raise ValueError(msg) from e

        # The download helper can hand back ``None`` (it is wrapped in
        # ``logger.catch()`` and the HTTP helper may return a falsy value), so
        # check for that before calling ``.strip()``.
        if not html or not html.strip():
            logger.error("Received empty response from BUBT")
            return None

        parser = ClassRoutineParser(html=html)
        table_htmls = parser.extract_routine_tables()
        if not table_htmls:
            logger.error(
                f"No routine raw HTML tables found for program code {program_code}"
            )
            return None

        tables = ClassRoutineTableTransformer().transform_to_models(
            table_htmls, program_code
        )
        if not tables:
            logger.error(f"No routine tables found for program code {program_code}")
            return None

        return filter_periods(tables, course_code, faculty_code, intake, section)



[docs]
    @logger.catch()
    async def get_exam_routine(
        self,
        program_code: ProgramCode,
        course_code: str | None = None,
        faculty_code: str | None = None,
        intake: str | None = None,
        section: str | None = None,
    ) -> list[TermExam] | None:
        """
        Get exam routine data based on provided filters.

        The exam routine PDF link of the program is looked up on the routine
        page, the PDF is downloaded, its tables are extracted and transformed
        into models, and the result is narrowed down with ``filter_periods``.

        :param program_code: The program code to filter by (e.g., `006`, `001`)
        :type program_code: ProgramCode
        :param course_code: The course code to filter by (e.g., `CSE 101`, `CSE 331`), defaults to None
        :type course_code: str, optional
        :param faculty_code: The faculty member code to filter by (e.g., `MDI`, `MAFI` ), defaults to None
        :type faculty_code: str, optional
        :param intake: The intake number to filter by (e.g., `49`, `50`), defaults to None
        :type intake: str, optional
        :param section: The section number to filter by (e.g., `1`, `2`), defaults to None
        :type section: str, optional
        :raises ValueError: If ``section`` is given without ``intake``
        :raises NotImplementedError: If no tables could be extracted from, or transformed for,
            the PDF of the given program
        :return: A list of filtered TermExam objects representing the exam routine, or ``None``
            when the PDF link is missing or the PDF could not be downloaded
        :rtype: list[TermExam] | None

        .. note::
            - Program codes follow the university's standard numbering system
            - Filtering by room number is not implemented as it's not possible *yet* to extract and parse varieties of routine PDFs.
            - The method is wrapped in ``logger.catch()``; with loguru's default settings
              a raised exception is logged and ``None`` is returned to the caller.
        .. warning::
            - Only `ProgramCode.CSE_DAY` is currently supported
        """
        if section and not intake:
            raise ValueError("Intake is required when filtering by section")

        # Resolve the PDF link for the requested program from the routine page
        # (previously a hard-coded localhost debugging URL was used here).
        pdf_link = await self.get_exam_routine_pdf_link(program_code)
        if not pdf_link:
            logger.error(f"No exam pdf link found for program code {program_code}")
            return None

        try:
            pdf_path = await make_get_request(pdf_link, is_file=True)
        except Exception as e:
            logger.error(f"Failed to download the exam routine PDF: {pdf_link}: {e}")
            return None

        # A falsy result is treated as a failed download as well.
        if not pdf_path:
            logger.error(f"Failed to download the exam routine PDF: {pdf_link}")
            return None

        pdf_raw_tables = BaseExamPdfParser().extract_raw_tables(program_code, pdf_path)
        if not pdf_raw_tables:
            raise NotImplementedError(
                f"Sorry, exam routine parser for program {program_code} is not implemented, Try using the API",
            )

        tables = BaseExamTableTransformer().transform_to_models(
            tables=pdf_raw_tables, program_code=program_code
        )
        if not tables:
            raise NotImplementedError(
                f"Sorry, exam routine pdf transformer for program {program_code} is not implemented, Try using the API",
            )

        return filter_periods(tables, course_code, faculty_code, intake, section)



[docs]
    @logger.catch()
    async def get_class_routine_html(self, program_code: ProgramCode):
        """
        Download the HTML page of the class routine that belongs to a program

        The class routine links are taken from :meth:`get_routine_links`; the
        first entry whose ``program_code`` equals the requested one is used.

        :param program_code: The program code to filter by (e.g., `006`, `001`)
        :type program_code: ProgramCode
        :raises ValueError: no class routine links are found
        :raises ValueError: the matching link points to a ``.pdf`` file
        :raises ValueError: class routine for program not found
        :return: whatever ``make_get_request`` yields for the routine link
            (presumably the HTML text of the class routine)
        :rtype: str
        """
        logger.info(f"Fetching class routine HTML for program code {program_code}")

        all_links = await self.get_routine_links()
        class_entries = all_links.get("class", {}).get("links")
        if not class_entries:
            raise ValueError("No class routine links found")

        # Lazily pick the first entry published for the requested program.
        matching = next(
            (entry for entry in class_entries if entry["program_code"] == program_code),
            None,
        )
        if matching is None:
            raise ValueError(f"Class routine not found for program code {program_code}")

        url = matching["link"]
        # Only HTML routines can be handled here; PDF routines are rejected.
        if url.endswith(".pdf"):
            raise ValueError(
                f"Unsupported class routine url for program code {program_code}: {url}"
            )

        return await make_get_request(url)



[docs]
    @logger.catch()
    async def get_exam_routine_pdf_link(self, program_code: ProgramCode) -> str:
        """
        Get exam routine pdf link from the routine webpage

        The exam links returned by :meth:`get_routine_links` are searched for a
        group whose label contains the shift name of the program, and within
        that group for an entry whose label matches a word of the program name.

        :param program_code: The program code to filter by (e.g., `006`, `001`)
        :type program_code: ProgramCode
        :raises ValueError: No exam routine links found from the routine page
        :raises ValueError: No exam pdf link found for the given program code
        :return: the exam routine PDF link of the program
        :rtype: str
        """
        routine_links = await self.get_routine_links()
        links = routine_links.get("exam", {}).get("links")
        if not links:
            raise ValueError("No exam routine links found")

        shift_name = get_shift_name(program_code).value.lower()
        program_name_parts = (
            reverse_program_code(program_code=program_code).lower().split()
        )

        for shift in links:
            for shift_label, program_links in shift.items():
                if shift_name not in shift_label.lower():
                    continue

                for entry in program_links:
                    # Each entry is read as a single ``{label: link}`` pair;
                    # skip empty entries instead of failing on them.
                    if not entry:
                        continue
                    name, link = next(iter(entry.items()))
                    # Drop the "day" marker from the label so that it can be
                    # compared with the individual words of the program name.
                    name = name.lower().replace("day", "").strip()
                    if name in program_name_parts:
                        return link

        # Exceptions do not apply %-style formatting to their arguments, so the
        # message is built explicitly.
        raise ValueError(f"{program_code}: PDF link not found.")