Source code for conda_recipe_manager.fetcher.http_artifact_fetcher

"""
:Description: Provides an Artifact Fetcher capable of acquiring a software archive from an HTTP/HTTPS source.
"""

from __future__ import annotations

import tarfile
import zipfile
from enum import Enum, auto
from pathlib import Path
from typing import Final, Iterator, cast
from urllib.parse import urlparse

import requests

from conda_recipe_manager.fetcher.base_artifact_fetcher import BaseArtifactFetcher
from conda_recipe_manager.fetcher.exceptions import FetchError
from conda_recipe_manager.utils.cryptography.hashing import hash_file

# Default download timeout for artifacts
_DOWNLOAD_TIMEOUT: Final[int] = 5 * 60  # 5 minutes



[docs]
class ArtifactArchiveType(Enum):
    """
    Enumerates the types of archive file formats that are supported.

    `HttpArtifactFetcher` starts out with `UNKNOWN` and records the detected type while extracting the downloaded
    file. The detection is based on the contents of the file, not on its file name.
    """

    # Standard ZIP archive.
    ZIP = auto()
    # TODO: Determine how to detect and extract 7-Zip archives in Python. Nothing in this module assigns this member
    # yet.
    ZIP_7 = auto()  # 7zip
    # Archive recognized by the standard library's `tarfile` module.
    TARBALL = auto()
    UNKNOWN = auto()  # Could not determine the artifact type




[docs]
class HttpArtifactFetcher(BaseArtifactFetcher):
    """
    Artifact Fetcher capable of downloading a software archive from a remote HTTP/HTTPS source.

    Typical usage is to construct the fetcher, call `fetch()` once, and then query the extracted source location,
    archive hash, or archive type. The query methods raise if `fetch()` has not completed successfully.
    """

    def __init__(self, name: str, archive_url: str):
        """
        Constructs an `HttpArtifactFetcher` instance.

        :param name: Identifies the artifact. Ideally, this is the package name. In multi-sourced/mirrored scenarios,
            this might be the package name combined with some identifying information.
        :param archive_url: URL that points to the target software archive.
        """
        super().__init__(name)
        self._archive_url = archive_url
        # Remains `UNKNOWN` until `_extract()` has inspected the downloaded file.
        self._archive_type = ArtifactArchiveType.UNKNOWN

        # We use `urlparse` to extract the file path containing the archive. This can be used to get the archive's file
        # name. Many of the archive files we deal with contain the version number with period markings. We also work
        # with archives with many different file extensions. To avoid the many pitfalls here of trying to calculate the
        # "true basename" of the file, we just pre-pend `extracted_` to indicate this is the folder containing the
        # extracted archive.
        #
        # A URL without a path component (e.g. `https://example.com/`) yields an empty name. Joining an empty name onto
        # the temporary directory would make the archive path BE the temporary directory, so the download could never
        # be written. Fall back to a generic file name in that case.
        archive_file_name: Final[str] = Path(urlparse(self._archive_url).path).name or "archive"
        extracted_dir_name: Final[str] = f"extracted_{archive_file_name}"

        self._archive_path: Final[Path] = self._temp_dir_path / archive_file_name
        self._uncompressed_archive_path: Final[Path] = self._temp_dir_path / extracted_dir_name

    def _extract(self) -> None:
        """
        Identifies the format of the downloaded archive by inspecting the file itself, records that format, and
        unpacks the archive into the extraction directory.

        :raises FetchError: If the archive format is not recognized or an issue occurred while extracting the archive.
        """
        try:
            if tarfile.is_tarfile(self._archive_path):
                self._archive_type = ArtifactArchiveType.TARBALL
                with tarfile.open(self._archive_path, mode="r") as tar_file:
                    # The `filter="data"` parameter guards against "the most dangerous security issues"
                    tar_file.extractall(path=self._uncompressed_archive_path, filter="data")
            elif zipfile.is_zipfile(self._archive_path):
                self._archive_type = ArtifactArchiveType.ZIP
                with zipfile.ZipFile(self._archive_path) as zip_file:
                    # TODO improve security checks
                    zip_file.extractall(path=self._uncompressed_archive_path)
            # TODO 7-zip support
            else:
                raise FetchError("The archive type could not be identified.")
        # `zipfile` reports password-protected members with `RuntimeError` and unsupported compression methods with
        # `NotImplementedError` (a `RuntimeError` subclass). Both are extraction failures from the caller's perspective.
        except (tarfile.TarError, zipfile.BadZipFile, ValueError, RuntimeError) as e:
            raise FetchError("An extraction error occurred while extracting the archive.") from e
        except IOError as e:
            raise FetchError("A file system error occurred while extracting the archive.") from e

    def fetch(self) -> None:
        """
        Retrieves a software archive from a remote HTTP/HTTPS host and stores the files in a secure temporary directory.

        The archive is streamed to disk in small chunks and then extracted.

        :raises FetchError: If an issue occurred while downloading or extracting the archive. This includes the remote
            host answering with an HTTP error status (4xx/5xx).
        """
        # Buffered download approach
        try:
            # Using the response as a context manager guarantees that the streamed connection is released, even if
            # writing to disk fails part-way through.
            with requests.get(str(self._archive_url), stream=True, timeout=_DOWNLOAD_TIMEOUT) as response:
                # Fail fast on an error status. Otherwise the body of an error page would be saved as the "archive"
                # and the failure would surface later as a misleading extraction error.
                response.raise_for_status()
                with open(self._archive_path, "wb") as archive:
                    for chunk in cast(Iterator[bytes], response.iter_content(chunk_size=1024)):
                        # Skip empty (keep-alive) chunks rather than aborting, which would truncate the archive.
                        if chunk:
                            archive.write(chunk)
        except requests.exceptions.RequestException as e:  # type: ignore[misc]
            raise FetchError("An HTTP error occurred while fetching the archive.") from e
        except IOError as e:
            raise FetchError("A file system error occurred while fetching the archive.") from e

        self._extract()

        # If we have not thrown at this point, we have successfully fetched the archive.
        self._successfully_fetched = True

    def get_path_to_source_code(self) -> Path:
        """
        Returns the directory containing the artifact's bundled source code.
        NOTE: If the target archive compresses a top-level folder that contains the source code, this path will point
        to a directory containing that uncompressed top-level folder.

        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        :returns: Path to the directory the archive was extracted into.
        """
        self._fetch_guard("Archive has not been downloaded, so the source code is unavailable.")

        return self._uncompressed_archive_path

    def get_archive_sha256(self) -> str:
        """
        Calculates a SHA-256 hash on the downloaded software archive.

        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        :returns: The SHA-256 hash of the downloaded archive file (not of the extracted contents).
        """
        self._fetch_guard("Archive has not been downloaded, so the file can't be hashed.")

        return hash_file(self._archive_path, "sha256")

    def get_archive_type(self) -> ArtifactArchiveType:
        """
        Returns the type of archive that was retrieved. This evaluation was determined by evaluating the file and not by
        the file name.

        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        :returns: The archive type recorded during extraction.
        """
        self._fetch_guard("Archive has not been downloaded, so the type can't be determined.")

        return self._archive_type