From a69950194108c2669e72090d813f0eac22aa1be7 Mon Sep 17 00:00:00 2001
From: llbbl
Date: Sun, 15 Jun 2025 11:49:59 -0500
Subject: [PATCH] feat: Set up comprehensive Python testing infrastructure with Poetry

- Migrate from setup.py to Poetry package manager in pyproject.toml
- Add pytest, pytest-cov, and pytest-mock as dev dependencies
- Configure pytest with custom markers (unit, integration, slow)
- Set up coverage reporting with 80% threshold and multiple formats
- Create tests/ directory structure with unit/ and integration/ subdirs
- Add comprehensive shared fixtures in conftest.py
- Update .gitignore with testing and development artifacts
- Create validation tests to verify infrastructure setup
- Configure Poetry scripts for 'test' and 'tests' commands
- Document testing setup and known ARM64 compatibility issues
---
 .gitignore                     |  45 +++++++
 pyproject.toml                 | 124 ++++++++++++++++++
 tests/README.md                |  94 +++++++++++++
 tests/__init__.py              |   0
 tests/conftest.py              | 233 +++++++++++++++++++++++++++++++++
 tests/integration/__init__.py  |   0
 tests/test_minimal_setup.py    |  37 ++++++
 tests/test_setup_validation.py | 172 ++++++++++++++++++++++++
 tests/unit/__init__.py         |   0
 9 files changed, 705 insertions(+)
 create mode 100644 pyproject.toml
 create mode 100644 tests/README.md
 create mode 100644 tests/__init__.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/integration/__init__.py
 create mode 100644 tests/test_minimal_setup.py
 create mode 100644 tests/test_setup_validation.py
 create mode 100644 tests/unit/__init__.py

diff --git a/.gitignore b/.gitignore
index 794312a6..a35684be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,48 @@ AUTHORS
 ChangeLog
 .DS_Store
 .mypy_cache
+
+# Testing artifacts
+.pytest_cache/
+.coverage
+htmlcov/
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+pytest_cache/
+
+# Claude settings
+.claude/*
+
+# Poetry
+# Note: Do not ignore poetry.lock - it should be committed
+
+# Virtual environments
+venv/
+ENV/
+env/
+.venv/
+.env
+
+# IDE specific files
+.vscode/
+*.sublime-project
+*.sublime-workspace
+
+# Build artifacts
+__pycache__/
+*.so
+*.dylib
+*.dll
+
+# OS specific
+Thumbs.db
+.DS_Store
+Desktop.ini
+
+# Temporary files
+*.tmp
+*.temp
+*.log
+.cache/
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..24cfa076
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,124 @@
+[tool.poetry]
+name = "tensorflow-transform"
+version = "1.17.1"
+description = "A library for data preprocessing with TensorFlow"
+authors = ["Google Inc."]
+license = "Apache-2.0"
+readme = "README.md"
+homepage = "https://www.tensorflow.org/tfx/transform/get_started"
+repository = "https://github.com/tensorflow/transform"
+keywords = ["tensorflow", "transform", "tfx"]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Education",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3 :: Only",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Mathematics",
+    "Topic :: Software Development",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+packages = [{include = "tensorflow_transform"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<4"
+absl-py = ">=0.9,<2.0.0"
+apache-beam = {version = ">=2.53,<3", extras = ["gcp"], python = ">=3.11"}
+numpy = ">=1.22.0"
+protobuf = [
+    {version = ">=4.25.2,<6.0.0", python = ">=3.11"},
+    {version = ">=4.21.6,<6.0.0", python = "<3.11"}
+]
+pyarrow = ">=10,<11"
+pydot = ">=1.2,<2"
+tensorflow = ">=2.17,<2.18"
+tensorflow-metadata = ">=1.15.0"
+tf-keras = ">=2"
+tfx-bsl = ">=1.15.0"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.0.0"
+pytest-cov = "^4.1.0"
+pytest-mock = "^3.12.0"
+
+[tool.poetry.scripts]
+test = "pytest:main"
+tests = "pytest:main"
+
+[tool.pytest.ini_options]
+minversion = "8.0"
+addopts = [
+    "--strict-markers",
+    "--tb=short",
+    "--cov=tensorflow_transform",
+    "--cov-report=term-missing:skip-covered",
+    "--cov-report=html",
+    "--cov-report=xml",
+    "--cov-fail-under=80",
+    "--doctest-modules",
+    "-v"
+]
+testpaths = ["tests"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+markers = [
+    "unit: marks tests as unit tests (fast, isolated)",
+    "integration: marks tests as integration tests (may require external resources)",
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')"
+]
+filterwarnings = [
+    "ignore::DeprecationWarning",
+    "ignore::PendingDeprecationWarning"
+]
+
+[tool.coverage.run]
+source = ["tensorflow_transform"]
+branch = true
+parallel = true
+omit = [
+    "*/tests/*",
+    "*/test_*.py",
+    "*/*_test.py",
+    "*/setup.py",
+    "*/version.py",
+    "*/__pycache__/*",
+    "*/site-packages/*"
+]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "def __str__",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if __name__ == .__main__.:",
+    "if TYPE_CHECKING:",
+    "if typing.TYPE_CHECKING:",
+    "@abstractmethod",
+    "@abc.abstractmethod"
+]
+precision = 2
+skip_covered = true
+show_missing = true
+
+[tool.coverage.html]
+directory = "htmlcov"
+
+[tool.coverage.xml]
+output = "coverage.xml"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
\ No newline at end of file
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..044b9ad1
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,94 @@
+# TensorFlow Transform Testing Infrastructure
+
+This directory contains the testing infrastructure for TensorFlow Transform.
+
+## Structure
+
+```
+tests/
+├── README.md                    # This file
+├── __init__.py                  # Package marker
+├── conftest.py                  # Shared pytest fixtures and configuration
+├── test_setup_validation.py     # Full validation tests (requires all dependencies)
+├── test_minimal_setup.py        # Minimal tests that work without all dependencies
+├── unit/                        # Unit tests directory
+│   └── __init__.py
+└── integration/                 # Integration tests directory
+    └── __init__.py
+```
+
+## Running Tests
+
+### Using Poetry Scripts
+
+```bash
+# Run all tests
+poetry run test
+
+# Alternative command (both work)
+poetry run tests
+
+# Run specific test file
+poetry run test tests/test_minimal_setup.py
+
+# Run with specific markers
+poetry run test -m unit
+poetry run test -m "not slow"
+```
+
+### Using pytest directly
+
+```bash
+# Run all tests
+python -m pytest
+
+# Run with coverage
+python -m pytest --cov=tensorflow_transform
+
+# Run without coverage (useful for debugging)
+python -m pytest --no-cov
+```
+
+## Test Markers
+
+- `@pytest.mark.unit` - Fast, isolated unit tests
+- `@pytest.mark.integration` - Integration tests that may require external resources
+- `@pytest.mark.slow` - Tests that take a long time to run
+
+## Available Fixtures
+
+See `conftest.py` for all available fixtures. Key fixtures include:
+
+- `temp_dir` - Temporary directory that's cleaned up after test
+- `temp_file` - Temporary file that's cleaned up after test
+- `mock_config` - Sample configuration dictionary
+- `sample_data` - Sample data for testing transformations
+- `tf_example_data` - Temporary TFRecord file with example data
+- `mock_preprocessing_fn` - Simple preprocessing function for testing
+- `mock_schema` - Simple schema for testing
+
+## Coverage Configuration
+
+Coverage is configured to:
+- Require 80% minimum coverage
+- Generate HTML reports in `htmlcov/`
+- Generate XML report as `coverage.xml`
+- Exclude test files and common patterns from coverage
+
+## Known Issues
+
+### ARM64 Architecture Support
+
+Some dependencies like `tfx-bsl` may not have pre-built wheels for ARM64 architecture (e.g., Apple Silicon Macs, ARM Linux). If you encounter installation issues:
+
+1. Try running the minimal test suite: `poetry run test tests/test_minimal_setup.py --no-cov`
+2. Consider using x86_64 emulation or a compatible environment
+3. Build dependencies from source if needed
+
+## Writing New Tests
+
+1. Place unit tests in `tests/unit/`
+2. Place integration tests in `tests/integration/`
+3. Use appropriate markers (`@pytest.mark.unit`, etc.)
+4. Import and use fixtures from `conftest.py`
+5. Follow existing test patterns and naming conventions
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..e0ca1623
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,233 @@
+"""Pytest configuration and shared fixtures for TensorFlow Transform tests."""
+
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Generator, Any, Dict
+
+import pytest
+import tensorflow as tf
+
+
+@pytest.fixture
+def temp_dir() -> Generator[Path, None, None]:
+    """Create a temporary directory for test files.
+
+    Yields:
+        Path: Path to the temporary directory that will be cleaned up after test.
+    """
+    temp_path = tempfile.mkdtemp()
+    yield Path(temp_path)
+    shutil.rmtree(temp_path, ignore_errors=True)
+
+
+@pytest.fixture
+def temp_file() -> Generator[Path, None, None]:
+    """Create a temporary file for testing.
+
+    Yields:
+        Path: Path to the temporary file that will be cleaned up after test.
+    """
+    fd, temp_path = tempfile.mkstemp()
+    os.close(fd)
+    yield Path(temp_path)
+    if os.path.exists(temp_path):
+        os.unlink(temp_path)
+
+
+@pytest.fixture
+def mock_config() -> Dict[str, Any]:
+    """Provide a mock configuration dictionary for testing.
+
+    Returns:
+        Dict[str, Any]: A sample configuration dictionary.
+    """
+    return {
+        "batch_size": 32,
+        "learning_rate": 0.001,
+        "epochs": 10,
+        "features": ["feature1", "feature2", "feature3"],
+        "label": "target",
+        "model_dir": "/tmp/model",
+        "preprocessing": {
+            "normalize": True,
+            "scale": True,
+            "bucketize": False
+        }
+    }
+
+
+@pytest.fixture
+def sample_data() -> Dict[str, Any]:
+    """Generate sample data for testing transformations.
+
+    Returns:
+        Dict[str, Any]: A dictionary containing sample features and labels.
+    """
+    return {
+        "numeric_feature": [1.0, 2.0, 3.0, 4.0, 5.0],
+        "categorical_feature": ["A", "B", "A", "C", "B"],
+        "text_feature": ["hello world", "tensorflow transform", "test data"],
+        "label": [0, 1, 0, 1, 1]
+    }
+
+
+@pytest.fixture
+def tf_example_data() -> Generator[str, None, None]:
+    """Create a temporary TFRecord file with example data.
+
+    Yields:
+        str: Path to the temporary TFRecord file.
+    """
+    import tensorflow as tf
+
+    temp_path = tempfile.mktemp(suffix=".tfrecord")
+
+    # Create sample TF examples
+    writer = tf.io.TFRecordWriter(temp_path)
+
+    for i in range(5):
+        example = tf.train.Example(
+            features=tf.train.Features(
+                feature={
+                    "numeric_feature": tf.train.Feature(
+                        float_list=tf.train.FloatList(value=[float(i)])
+                    ),
+                    "categorical_feature": tf.train.Feature(
+                        bytes_list=tf.train.BytesList(value=[f"category_{i}".encode()])
+                    ),
+                    "label": tf.train.Feature(
+                        int64_list=tf.train.Int64List(value=[i % 2])
+                    )
+                }
+            )
+        )
+        writer.write(example.SerializeToString())
+
+    writer.close()
+
+    yield temp_path
+
+    if os.path.exists(temp_path):
+        os.unlink(temp_path)
+
+
+@pytest.fixture
+def mock_transform_output_dir(temp_dir: Path) -> Path:
+    """Create a mock transform output directory structure.
+
+    Args:
+        temp_dir: Temporary directory fixture.
+
+    Returns:
+        Path: Path to the transform output directory.
+    """
+    output_dir = temp_dir / "transform_output"
+    output_dir.mkdir()
+
+    # Create expected subdirectories
+    (output_dir / "transformed_metadata").mkdir()
+    (output_dir / "transform_fn").mkdir()
+    (output_dir / "transformed_data").mkdir()
+
+    return output_dir
+
+
+@pytest.fixture(autouse=True)
+def reset_tensorflow_state():
+    """Reset TensorFlow state between tests to avoid interference."""
+    yield
+    tf.keras.backend.clear_session()
+
+
+@pytest.fixture
+def mock_preprocessing_fn():
+    """Provide a simple preprocessing function for testing.
+
+    Returns:
+        callable: A preprocessing function that applies basic transformations.
+    """
+    def preprocessing_fn(inputs):
+        """Simple preprocessing function for testing."""
+        import tensorflow_transform as tft
+
+        outputs = {}
+
+        # Normalize numeric features
+        if "numeric_feature" in inputs:
+            outputs["numeric_feature_normalized"] = tft.scale_to_z_score(
+                inputs["numeric_feature"]
+            )
+
+        # Vocabulary for categorical features
+        if "categorical_feature" in inputs:
+            outputs["categorical_feature_integerized"] = tft.compute_and_apply_vocabulary(
+                inputs["categorical_feature"]
+            )
+
+        # Pass through labels
+        if "label" in inputs:
+            outputs["label"] = inputs["label"]
+
+        return outputs
+
+    return preprocessing_fn
+
+
+@pytest.fixture
+def mock_schema():
+    """Provide a mock schema for testing.
+
+    Returns:
+        schema_pb2.Schema: A simple schema for testing.
+    """
+    from tensorflow_metadata.proto.v0 import schema_pb2
+
+    schema = schema_pb2.Schema()
+
+    # Add numeric feature
+    numeric_feature = schema.feature.add()
+    numeric_feature.name = "numeric_feature"
+    numeric_feature.type = schema_pb2.FLOAT
+
+    # Add categorical feature
+    categorical_feature = schema.feature.add()
+    categorical_feature.name = "categorical_feature"
+    categorical_feature.type = schema_pb2.BYTES
+
+    # Add label
+    label_feature = schema.feature.add()
+    label_feature.name = "label"
+    label_feature.type = schema_pb2.INT
+
+    return schema
+
+
+# Markers for test organization
+def pytest_configure(config):
+    """Configure custom pytest markers."""
+    config.addinivalue_line(
+        "markers", "unit: mark test as a unit test (fast, isolated)"
+    )
+    config.addinivalue_line(
+        "markers", "integration: mark test as an integration test (may require external resources)"
+    )
+    config.addinivalue_line(
+        "markers", "slow: mark test as slow (deselect with '-m \"not slow\"')"
+    )
+
+
+# Test collection hooks
+def pytest_collection_modifyitems(config, items):
+    """Modify test collection to add markers based on test location."""
+    for item in items:
+        # Add markers based on test file location
+        if "unit" in str(item.fspath):
+            item.add_marker(pytest.mark.unit)
+        elif "integration" in str(item.fspath):
+            item.add_marker(pytest.mark.integration)
+
+        # Add slow marker for tests with "slow" in their name
+        if "slow" in item.name.lower():
+            item.add_marker(pytest.mark.slow)
\ No newline at end of file
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_minimal_setup.py b/tests/test_minimal_setup.py
new file mode 100644
index 00000000..574da859
--- /dev/null
+++ b/tests/test_minimal_setup.py
@@ -0,0 +1,37 @@
+"""Minimal validation tests that can run without all dependencies."""
+
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+
+class TestMinimalSetup:
+    """Minimal tests to validate basic setup."""
+
+    def test_pytest_works(self):
+        """Basic test to ensure pytest is functional."""
+        assert 1 + 1 == 2
+
+    def test_project_structure(self):
+        """Test that basic project structure is in place."""
+        root = Path(__file__).parent.parent
+        assert (root / "tests").exists()
+        assert (root / "pyproject.toml").exists()
+        assert (root / ".gitignore").exists()
+
+    def test_fixtures_work(self, tmp_path):
+        """Test that basic pytest fixtures work."""
+        test_file = tmp_path / "test.txt"
+        test_file.write_text("hello")
+        assert test_file.read_text() == "hello"
+
+    @pytest.mark.unit
+    def test_markers_work(self):
+        """Test that custom markers are functional."""
+        assert True
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
\ No newline at end of file
diff --git a/tests/test_setup_validation.py b/tests/test_setup_validation.py
new file mode 100644
index 00000000..57e7fb8b
--- /dev/null
+++ b/tests/test_setup_validation.py
@@ -0,0 +1,172 @@
+"""Validation tests to verify the testing infrastructure is properly set up."""
+
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+
+class TestInfrastructureSetup:
+    """Test class to validate the testing infrastructure setup."""
+
+    def test_pytest_is_installed(self):
+        """Verify pytest is installed and importable."""
+        import pytest
+        assert pytest.__version__
+
+    def test_pytest_cov_is_installed(self):
+        """Verify pytest-cov is installed and importable."""
+        import pytest_cov
+        assert pytest_cov
+
+    def test_pytest_mock_is_installed(self):
+        """Verify pytest-mock is installed and importable."""
+        import pytest_mock
+        assert pytest_mock
+
+    def test_project_structure_exists(self):
+        """Verify the expected project structure exists."""
+        project_root = Path(__file__).parent.parent
+
+        # Check main directories
+        assert project_root.exists()
+        assert (project_root / "tensorflow_transform").exists()
+        assert (project_root / "tests").exists()
+        assert (project_root / "tests" / "unit").exists()
+        assert (project_root / "tests" / "integration").exists()
+
+        # Check configuration files
+        assert (project_root / "pyproject.toml").exists()
+        assert (project_root / ".gitignore").exists()
+
+    def test_conftest_fixtures_available(self, temp_dir, mock_config, sample_data):
+        """Verify conftest fixtures are available and working."""
+        # Test temp_dir fixture
+        assert temp_dir.exists()
+        assert temp_dir.is_dir()
+
+        # Test mock_config fixture
+        assert isinstance(mock_config, dict)
+        assert "batch_size" in mock_config
+        assert "features" in mock_config
+
+        # Test sample_data fixture
+        assert isinstance(sample_data, dict)
+        assert "numeric_feature" in sample_data
+        assert "categorical_feature" in sample_data
+
+    def test_markers_are_registered(self):
+        """Verify custom markers are properly registered."""
+        markers = list(pytest.mark._markers)  # registered marker names (a set of strings)
+        assert "unit" in markers
+        assert "integration" in markers
+        assert "slow" in markers
+
+    @pytest.mark.unit
+    def test_unit_marker_works(self):
+        """Test that unit marker can be applied."""
+        assert True
+
+    @pytest.mark.integration
+    def test_integration_marker_works(self):
+        """Test that integration marker can be applied."""
+        assert True
+
+    @pytest.mark.slow
+    def test_slow_marker_works(self):
+        """Test that slow marker can be applied."""
+        assert True
+
+    def test_tensorflow_import(self):
+        """Verify TensorFlow can be imported."""
+        import tensorflow as tf
+        assert tf.__version__
+
+    def test_tensorflow_transform_import(self):
+        """Verify tensorflow_transform can be imported."""
+        sys.path.insert(0, str(Path(__file__).parent.parent))
+        import tensorflow_transform as tft
+        assert tft.__version__
+
+    def test_coverage_configuration(self):
+        """Verify coverage is properly configured."""
+        project_root = Path(__file__).parent.parent
+        pyproject_path = project_root / "pyproject.toml"
+
+        with open(pyproject_path, "r") as f:
+            content = f.read()
+
+        # Check coverage configuration exists
+        assert "[tool.coverage.run]" in content
+        assert "[tool.coverage.report]" in content
+        assert "source = [\"tensorflow_transform\"]" in content
+        assert "--cov-fail-under=80" in content
+
+    def test_pytest_configuration(self):
+        """Verify pytest is properly configured."""
+        project_root = Path(__file__).parent.parent
+        pyproject_path = project_root / "pyproject.toml"
+
+        with open(pyproject_path, "r") as f:
+            content = f.read()
+
+        # Check pytest configuration exists
+        assert "[tool.pytest.ini_options]" in content
+        assert "testpaths = [\"tests\"]" in content
+        assert "--strict-markers" in content
+
+    def test_poetry_scripts_configured(self):
+        """Verify Poetry scripts are properly configured."""
+        project_root = Path(__file__).parent.parent
+        pyproject_path = project_root / "pyproject.toml"
+
+        with open(pyproject_path, "r") as f:
+            content = f.read()
+
+        # Check poetry scripts exist
+        assert "[tool.poetry.scripts]" in content
+        assert 'test = "pytest:main"' in content
+        assert 'tests = "pytest:main"' in content
+
+
+class TestMockingCapabilities:
+    """Test class to validate mocking capabilities."""
+
+    def test_pytest_mock_fixture(self, mocker):
+        """Test that pytest-mock mocker fixture works."""
+        mock_func = mocker.Mock(return_value="mocked")
+        assert mock_func() == "mocked"
+        mock_func.assert_called_once()
+
+    def test_mock_patch(self, mocker):
+        """Test that patching with mocker works."""
+        mock_os = mocker.patch("os.path.exists")
+        mock_os.return_value = True
+
+        result = os.path.exists("/fake/path")
+        assert result is True
+        mock_os.assert_called_with("/fake/path")
+
+
+class TestTempFileHandling:
+    """Test class to validate temporary file handling."""
+
+    def test_temp_dir_cleanup(self, temp_dir):
+        """Test that temp_dir is created and will be cleaned up."""
+        test_file = temp_dir / "test.txt"
+        test_file.write_text("test content")
+
+        assert test_file.exists()
+        assert test_file.read_text() == "test content"
+
+    def test_temp_file_cleanup(self, temp_file):
+        """Test that temp_file is created and will be cleaned up."""
+        assert temp_file.exists()
+        temp_file.write_text("temporary content")
+        assert temp_file.read_text() == "temporary content"
+
+
+if __name__ == "__main__":
+    # Run tests when script is executed directly
+    pytest.main([__file__, "-v"])
\ No newline at end of file
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000..e69de29b
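
Editor's note, for illustration only (not part of the patch): below is a minimal sketch of a test module that follows the conventions this patch introduces. The file name and assertions are hypothetical; the test relies on the mock_config and sample_data fixtures defined in tests/conftest.py and on the unit marker registered in pyproject.toml, so it would be collected from tests/unit/ and selectable with `poetry run test -m unit`.

# tests/unit/test_conventions_example.py -- hypothetical example, not included in the patch above
"""Example unit tests using the shared fixtures and custom markers."""

import pytest


@pytest.mark.unit
def test_mock_config_has_expected_keys(mock_config):
    # mock_config is the shared fixture from tests/conftest.py
    assert mock_config["batch_size"] == 32
    assert "feature1" in mock_config["features"]


@pytest.mark.unit
def test_sample_data_features_align_with_labels(sample_data):
    # sample_data is the shared fixture from tests/conftest.py
    assert len(sample_data["numeric_feature"]) == len(sample_data["label"])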