# Source code for conda_recipe_manager.fetcher.http_artifact_fetcher

"""
:Description: Provides an Artifact Fetcher capable of acquiring a software archive from an HTTP/HTTPS source.
"""

from __future__ import annotations

import tarfile
import zipfile
from enum import Enum, auto
from pathlib import Path
from typing import Final, Iterator, cast
from urllib.parse import urlparse

import requests

from conda_recipe_manager.fetcher.base_artifact_fetcher import BaseArtifactFetcher
from conda_recipe_manager.fetcher.exceptions import FetchError
from conda_recipe_manager.utils.cryptography.hashing import hash_file

# Default download timeout for artifacts, in seconds. This value is passed as the `timeout` argument of the
# `requests.get()` call that downloads the archive.
_DOWNLOAD_TIMEOUT: Final[int] = 5 * 60  # 5 minutes


class ArtifactArchiveType(Enum):
    """
    Enumerates the types of archive file formats that are supported.

    Member values are assigned automatically by `auto()` in declaration order, so new members should be appended
    rather than inserted if the numeric values need to stay stable.
    """

    ZIP = auto()
    # TODO determine how to do this in Python
    ZIP_7 = auto()  # 7zip
    TARBALL = auto()
    UNKNOWN = auto()  # Could not determine the artifact type
class HttpArtifactFetcher(BaseArtifactFetcher):
    """
    Artifact Fetcher capable of downloading a software archive from a remote HTTP/HTTPS source.
    """

    def __init__(self, name: str, archive_url: str):
        """
        Constructs an `HttpArtifactFetcher` instance.

        :param name: Identifies the artifact. Ideally, this is the package name. In multi-sourced/mirrored scenarios,
            this might be the package name combined with some identifying information.
        :param archive_url: URL that points to the target software archive.
        """
        super().__init__(name)
        self._archive_url = archive_url
        # The real type is only known once the downloaded file has been inspected by `_extract()`.
        self._archive_type = ArtifactArchiveType.UNKNOWN

        # We use `urlparse` to extract the file path containing the archive. This can be used to get the archive's file
        # name. Many of the archive files we deal with contain the version number with period markings. We also work
        # with archives with many different file extensions. To avoid the many pitfalls here of trying to calculate the
        # "true basename" of the file, we just pre-pend `extracted_` to indicate this is the folder containing the
        # extracted archive.
        archive_file_name: Final[str] = Path(urlparse(self._archive_url).path).name
        extracted_dir_name: Final[str] = f"extracted_{archive_file_name}"
        self._archive_path: Final[Path] = self._temp_dir_path / archive_file_name
        self._uncompressed_archive_path: Final[Path] = self._temp_dir_path / extracted_dir_name

    def _extract(self) -> None:
        """
        Identifies the format of the downloaded archive by inspecting the file and extracts its contents into the
        extraction directory. On a recognized format, the detected archive type is recorded on the instance.

        :raises FetchError: If the archive type could not be identified or an issue occurred while extracting the
            archive.
        """
        try:
            # The format is detected from the file's contents, not from the file name in the URL.
            if tarfile.is_tarfile(self._archive_path):
                self._archive_type = ArtifactArchiveType.TARBALL
                with tarfile.open(self._archive_path, mode="r") as tar_file:
                    # The `filter="data"` parameter guards against "the most dangerous security issues"
                    tar_file.extractall(path=self._uncompressed_archive_path, filter="data")
            elif zipfile.is_zipfile(self._archive_path):
                self._archive_type = ArtifactArchiveType.ZIP
                with zipfile.ZipFile(self._archive_path) as zip_file:
                    # TODO improve security checks
                    zip_file.extractall(path=self._uncompressed_archive_path)
            # TODO 7-zip support
            else:
                raise FetchError("The archive type could not be identified.")
        except (tarfile.TarError, zipfile.BadZipFile, ValueError) as e:
            raise FetchError("An extraction error occurred while extracting the archive.") from e
        except IOError as e:
            raise FetchError("A file system error occurred while extracting the archive.") from e

    def fetch(self) -> None:
        """
        Retrieves a software archive from a remote HTTP/HTTPS host and stores the files in a secure temporary directory.

        :raises FetchError: If an issue occurred while downloading or extracting the archive. This includes the remote
            host answering with an HTTP error status code.
        """
        # Buffered download approach
        try:
            # Using the response as a context manager guarantees the streamed connection is released, even if the
            # download is interrupted by an exception.
            with requests.get(str(self._archive_url), stream=True, timeout=_DOWNLOAD_TIMEOUT) as response:
                # Fail early on 4xx/5xx responses so that an error page is never written to disk and mistaken for a
                # corrupt archive. The resulting `HTTPError` is a `RequestException` and is handled below.
                response.raise_for_status()
                with open(self._archive_path, "wb") as archive:
                    for chunk in cast(Iterator[bytes], response.iter_content(chunk_size=1024)):
                        # Skip empty chunks instead of aborting, so the download can not be silently truncated.
                        if chunk:
                            archive.write(chunk)
        # NOTE: `RequestException` must be handled before `IOError`, as it derives from it.
        except requests.exceptions.RequestException as e:  # type: ignore[misc]
            raise FetchError("An HTTP error occurred while fetching the archive.") from e
        except IOError as e:
            raise FetchError("A file system error occurred while fetching the archive.") from e

        self._extract()

        # If we have not thrown at this point, we have successfully fetched the archive.
        self._successfully_fetched = True

    def get_path_to_source_code(self) -> Path:
        """
        Returns the directory containing the artifact's bundled source code.
        NOTE: If the target archive compresses a top-level folder that contains the source code, this path will point
        to a directory containing that uncompressed top-level folder.

        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        :returns: Path to the directory the archive was extracted into.
        """
        self._fetch_guard("Archive has not been downloaded, so the source code is unavailable.")
        return self._uncompressed_archive_path

    def get_archive_sha256(self) -> str:
        """
        Calculates a SHA-256 hash on the downloaded software archive.

        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        :returns: The SHA-256 hash of the downloaded archive file.
        """
        self._fetch_guard("Archive has not been downloaded, so the file can't be hashed.")
        return hash_file(self._archive_path, "sha256")

    def get_archive_type(self) -> ArtifactArchiveType:
        """
        Returns the type of archive that was retrieved. This evaluation was determined by evaluating the file and not by
        the file name.

        :raises FetchRequiredError: If `fetch()` has not been successfully invoked.
        :returns: The archive type detected while extracting the archive.
        """
        self._fetch_guard("Archive has not been downloaded, so the type can't be determined.")
        return self._archive_type