Source code for conda_recipe_manager.scanner.dependency.py_dep_scanner

"""
:Description: Provides a Dependency Scanner class capable of finding dependencies in a Python project's source code.
"""

from __future__ import annotations

import ast
import pkgutil
import sys
from pathlib import Path
from typing import Final

from conda_recipe_manager.parser.dependency import DependencySection
from conda_recipe_manager.scanner.dependency.base_dep_scanner import (
    BaseDependencyScanner,
    ProjectDependency,
    new_project_dependency,
)
from conda_recipe_manager.types import MessageCategory

# Table that maps import names to dependency (package) names for common packages whose import name does not match
# the package name. See this StackOverflow post for more details:
#  https://stackoverflow.com/questions/54886143/why-are-some-python-package-names-different-than-their-import-name
_IMPORT_TO_DEPENDENCY_NAME_TBL: Final[dict[str, str]] = {
    "git": "gitpython",
    "yaml": "pyyaml",
    "PIL": "pillow",
    "sklearn": "scikit-learn",
    "tables": "pytables",
    "cv": "py-opencv",
    "cv2": "py-opencv",
    "OpenGL": "pyopengl",
}


class PythonDependencyScanner(BaseDependencyScanner):
    """
    Dependency Scanner class capable of scanning Python source code.
    """

    @staticmethod
    def _correct_module_to_dependency(module: str) -> str:
        """
        Corrects common dependency names that are not the same as their imported name.

        :param module: Module name to correct
        :returns: The corrected name, if one is found. Otherwise, the original string.
        """
        if module not in _IMPORT_TO_DEPENDENCY_NAME_TBL:
            return module
        return _IMPORT_TO_DEPENDENCY_NAME_TBL[module]

    @staticmethod
    def _is_likely_test_file(file: Path) -> bool:
        """
        Attempts to determine if a Python file is a test file.

        :param file: Path to the file to check
        :returns: True if we determine that this file/path likely points to a test file.
        """
        # NOTE: This is by no means a perfect function. We will have to iterate on this approach over time.
        sanitized_name: Final[str] = file.name.lower()
        if sanitized_name.startswith("test_") or sanitized_name.endswith("_test.py"):
            return True
        # TODO: Check with the `ast` library if pytest, unittest, pyfakefs, etc are imported(?)
        return False

    def __init__(self, src_dir: Path | str):
        """
        Constructs a `PythonDependencyScanner`.

        :param src_dir: Path to the Python source code to scan.
        """
        super().__init__()
        self._src_dir: Final[Path] = Path(src_dir)

    def _get_project_modules(self) -> set[str]:
        """
        Calculates the set of module names found in this project. These will not need to be listed as dependencies
        in the recipe file (as they are a part of the project).

        :returns: A set of unique dependencies defined in this project's source code.
        """
        return {name for _, name, _ in pkgutil.iter_modules([str(self._src_dir)])}

    def _scan_one_file(self, file: Path) -> set[ProjectDependency]:
        """
        Helper function that scans one Python file for dependencies.

        :returns: Set of project dependencies found in the target Python file.
        """
        deps: set[ProjectDependency] = set()
        project_modules: Final[set[str]] = self._get_project_modules()

        # Adapted from:
        #   https://stackoverflow.com/questions/9008451/python-easy-way-to-read-all-import-statements-from-py-module
        root = ast.parse(file.read_text(), file)
        for node in ast.walk(root):
            if not isinstance(node, (ast.Import, ast.ImportFrom)):
                continue

            module_names = []
            if isinstance(node, ast.Import):
                # Handle multiple (comma-separated) imports on one line
                for alias in node.names:
                    module_names.append(alias.name.split(".")[0])
            elif node.module is not None:
                module_names.append(node.module.split(".")[0])

            for module_name in module_names:
                # TODO filter relative imports
                # Filter-out the standard library modules and local module names (i.e. modules defined in the target
                # project).
                if not module_name or module_name in sys.stdlib_module_names or module_name in project_modules:
                    continue

                package_name = PythonDependencyScanner._correct_module_to_dependency(module_name)
                # Most Python imports fall under the `run` section in the Conda recipe format. The major exception is
                # any import found in test code.
                dep_type = (
                    DependencySection.TESTS
                    if PythonDependencyScanner._is_likely_test_file(file)
                    else DependencySection.RUN
                )
                deps.add(new_project_dependency(package_name, dep_type))

        return deps
    def scan(self) -> set[ProjectDependency]:
        """
        Actively scans a project for dependencies.

        :returns: A set of unique dependencies found by the scanner.
        """
        # TODO parallelize this? Some preliminary performance tests show conflicting results using `multiprocessing`
        # pools. Very large Python projects can see a 50% reduction in scanning while small projects take a 30%-40%
        # hit in speed with spin-up costs.
        all_imports: set[ProjectDependency] = set()
        for file in self._src_dir.rglob("*.py"):
            try:
                all_imports |= self._scan_one_file(file)
            except Exception as e:  # pylint: disable=broad-exception-caught
                self._msg_tbl.add_message(
                    MessageCategory.EXCEPTION, f"Exception encountered while scanning `{file}`: {e}"
                )

        # `RUN` dependencies are automatically added as `TEST` dependencies, so we need to filter if there are
        # (effectively) duplicates
        def _filter_test_duplicates(dep: ProjectDependency) -> bool:
            if (
                dep.type == DependencySection.TESTS
                and ProjectDependency(dep.data, DependencySection.RUN) in all_imports
            ):
                return False
            return True

        all_imports = set(filter(_filter_test_duplicates, all_imports))

        # TODO determine if users care to attempt to determine if `types-*` packages are to be included for common
        # libraries.
        # TODO filter unused imports

        # Python is inherently a HOST and RUN dependency for all Python projects.
        all_imports.add(new_project_dependency("python", DependencySection.HOST))
        all_imports.add(new_project_dependency("python", DependencySection.RUN))
        return all_imports
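
Usage sketch (not part of the module above): a minimal, hypothetical example of driving the scanner. It relies only on what this listing shows, namely the `PythonDependencyScanner(src_dir)` constructor, `scan()` returning a set of `ProjectDependency` values, and the `data`/`type` fields referenced inside `scan()`; the source path below is purely illustrative.

    from pathlib import Path

    from conda_recipe_manager.scanner.dependency.py_dep_scanner import PythonDependencyScanner

    # Hypothetical project location; point this at any Python source tree.
    scanner = PythonDependencyScanner(Path("./my_project/src"))

    # `scan()` returns a set of ProjectDependency entries, each pairing a package name
    # with the recipe section it was assigned to (e.g. HOST, RUN, or TESTS).
    for dep in scanner.scan():
        print(dep.data, dep.type)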