From a69950194108c2669e72090d813f0eac22aa1be7 Mon Sep 17 00:00:00 2001
From: llbbl
Date: Sun, 15 Jun 2025 11:49:59 -0500
Subject: [PATCH] feat: Set up comprehensive Python testing infrastructure with Poetry

- Migrate from setup.py to Poetry package manager in pyproject.toml
- Add pytest, pytest-cov, and pytest-mock as dev dependencies
- Configure pytest with custom markers (unit, integration, slow)
- Set up coverage reporting with 80% threshold and multiple formats
- Create tests/ directory structure with unit/ and integration/ subdirs
- Add comprehensive shared fixtures in conftest.py
- Update .gitignore with testing and development artifacts
- Create validation tests to verify infrastructure setup
- Configure Poetry scripts for 'test' and 'tests' commands
- Document testing setup and known ARM64 compatibility issues
---
 .gitignore                     |  45 +++++++
 pyproject.toml                 | 124 ++++++++++++++++++
 tests/README.md                |  94 +++++++++++++
 tests/__init__.py              |   0
 tests/conftest.py              | 233 +++++++++++++++++++++++++++++++++
 tests/integration/__init__.py  |   0
 tests/test_minimal_setup.py    |  37 ++++++
 tests/test_setup_validation.py | 172 ++++++++++++++++++++++++
 tests/unit/__init__.py         |   0
 9 files changed, 705 insertions(+)
 create mode 100644 pyproject.toml
 create mode 100644 tests/README.md
 create mode 100644 tests/__init__.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/integration/__init__.py
 create mode 100644 tests/test_minimal_setup.py
 create mode 100644 tests/test_setup_validation.py
 create mode 100644 tests/unit/__init__.py

diff --git a/.gitignore b/.gitignore
index 794312a6..a35684be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,48 @@ AUTHORS
 ChangeLog
 .DS_Store
 .mypy_cache
+
+# Testing artifacts
+.pytest_cache/
+.coverage
+htmlcov/
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+pytest_cache/
+
+# Claude settings
+.claude/*
+
+# Poetry
+# Note: Do not ignore poetry.lock - it should be committed
+
+# Virtual environments
+venv/
+ENV/
+env/
+.venv/
+.env
+
+# IDE specific files
+.vscode/
+*.sublime-project
+*.sublime-workspace
+
+# Build artifacts
+__pycache__/
+*.so
+*.dylib
+*.dll
+
+# OS specific
+Thumbs.db
+.DS_Store
+Desktop.ini
+
+# Temporary files
+*.tmp
+*.temp
+*.log
+.cache/
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..24cfa076
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,124 @@
+[tool.poetry]
+name = "tensorflow-transform"
+version = "1.17.1"
+description = "A library for data preprocessing with TensorFlow"
+authors = ["Google Inc."]
+license = "Apache-2.0"
+readme = "README.md"
+homepage = "https://www.tensorflow.org/tfx/transform/get_started"
+repository = "https://github.com/tensorflow/transform"
+keywords = ["tensorflow", "transform", "tfx"]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Education",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3 :: Only",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Mathematics",
+    "Topic :: Software Development",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+packages = [{include = "tensorflow_transform"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<4"
+absl-py = ">=0.9,<2.0.0"
+apache-beam = {version = ">=2.53,<3", extras = ["gcp"], python = ">=3.11"}
+numpy = ">=1.22.0"
+protobuf = [
+    {version = ">=4.25.2,<6.0.0", python = ">=3.11"},
+    {version = ">=4.21.6,<6.0.0", python = "<3.11"}
+]
+pyarrow = ">=10,<11"
+pydot = ">=1.2,<2"
+tensorflow = ">=2.17,<2.18"
+tensorflow-metadata = ">=1.15.0"
+tf-keras = ">=2"
+tfx-bsl = ">=1.15.0"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.0.0"
+pytest-cov = "^4.1.0"
+pytest-mock = "^3.12.0"
+
+[tool.poetry.scripts]
+test = "pytest:main"
+tests = "pytest:main"
+
+[tool.pytest.ini_options]
+minversion = "8.0"
+addopts = [
+    "--strict-markers",
+    "--tb=short",
+    "--cov=tensorflow_transform",
+    "--cov-report=term-missing:skip-covered",
+    "--cov-report=html",
+    "--cov-report=xml",
+    "--cov-fail-under=80",
+    "--doctest-modules",
+    "-v"
+]
+testpaths = ["tests"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+markers = [
+    "unit: marks tests as unit tests (fast, isolated)",
+    "integration: marks tests as integration tests (may require external resources)",
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')"
+]
+filterwarnings = [
+    "ignore::DeprecationWarning",
+    "ignore::PendingDeprecationWarning"
+]
+
+[tool.coverage.run]
+source = ["tensorflow_transform"]
+branch = true
+parallel = true
+omit = [
+    "*/tests/*",
+    "*/test_*.py",
+    "*/*_test.py",
+    "*/setup.py",
+    "*/version.py",
+    "*/__pycache__/*",
+    "*/site-packages/*"
+]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "def __str__",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if __name__ == .__main__.:",
+    "if TYPE_CHECKING:",
+    "if typing.TYPE_CHECKING:",
+    "@abstractmethod",
+    "@abc.abstractmethod"
+]
+precision = 2
+skip_covered = true
+show_missing = true
+
+[tool.coverage.html]
+directory = "htmlcov"
+
+[tool.coverage.xml]
+output = "coverage.xml"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
\ No newline at end of file
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..044b9ad1
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,94 @@
+# TensorFlow Transform Testing Infrastructure
+
+This directory contains the testing infrastructure for TensorFlow Transform.
+
+## Structure
+
+```
+tests/
+├── README.md                    # This file
+├── __init__.py                  # Package marker
+├── conftest.py                  # Shared pytest fixtures and configuration
+├── test_setup_validation.py     # Full validation tests (requires all dependencies)
+├── test_minimal_setup.py        # Minimal tests that work without all dependencies
+├── unit/                        # Unit tests directory
+│   └── __init__.py
+└── integration/                 # Integration tests directory
+    └── __init__.py
+```
+
+## Running Tests
+
+### Using Poetry Scripts
+
+```bash
+# Run all tests
+poetry run test
+
+# Alternative command (both work)
+poetry run tests
+
+# Run specific test file
+poetry run test tests/test_minimal_setup.py
+
+# Run with specific markers
+poetry run test -m unit
+poetry run test -m "not slow"
+```
+
+### Using pytest directly
+
+```bash
+# Run all tests
+python -m pytest
+
+# Run with coverage
+python -m pytest --cov=tensorflow_transform
+
+# Run without coverage (useful for debugging)
+python -m pytest --no-cov
+```
+
+## Test Markers
+
+- `@pytest.mark.unit` - Fast, isolated unit tests
+- `@pytest.mark.integration` - Integration tests that may require external resources
+- `@pytest.mark.slow` - Tests that take a long time to run
+
+## Available Fixtures
+
+See `conftest.py` for all available fixtures. Key fixtures include:
+
+- `temp_dir` - Temporary directory that's cleaned up after test
+- `temp_file` - Temporary file that's cleaned up after test
+- `mock_config` - Sample configuration dictionary
+- `sample_data` - Sample data for testing transformations
+- `tf_example_data` - Temporary TFRecord file with example data
+- `mock_preprocessing_fn` - Simple preprocessing function for testing
+- `mock_schema` - Simple schema for testing
+
+## Coverage Configuration
+
+Coverage is configured to:
+- Require 80% minimum coverage
+- Generate HTML reports in `htmlcov/`
+- Generate XML report as `coverage.xml`
+- Exclude test files and common patterns from coverage
+
+## Known Issues
+
+### ARM64 Architecture Support
+
+Some dependencies like `tfx-bsl` may not have pre-built wheels for ARM64 architecture (e.g., Apple Silicon Macs, ARM Linux). If you encounter installation issues:
+
+1. Try running the minimal test suite: `poetry run test tests/test_minimal_setup.py --no-cov`
+2. Consider using x86_64 emulation or a compatible environment
+3. Build dependencies from source if needed
+
+## Writing New Tests
+
+1. Place unit tests in `tests/unit/`
+2. Place integration tests in `tests/integration/`
+3. Use appropriate markers (`@pytest.mark.unit`, etc.)
+4. Import and use fixtures from `conftest.py`
+5. Follow existing test patterns and naming conventions
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..e0ca1623
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,233 @@
+"""Pytest configuration and shared fixtures for TensorFlow Transform tests."""
+
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Generator, Any, Dict
+
+import pytest
+import tensorflow as tf
+
+
+@pytest.fixture
+def temp_dir() -> Generator[Path, None, None]:
+    """Create a temporary directory for test files.
+
+    Yields:
+        Path: Path to the temporary directory that will be cleaned up after test.
+    """
+    temp_path = tempfile.mkdtemp()
+    yield Path(temp_path)
+    shutil.rmtree(temp_path, ignore_errors=True)
+
+
+@pytest.fixture
+def temp_file() -> Generator[Path, None, None]:
+    """Create a temporary file for testing.
+
+    Yields:
+        Path: Path to the temporary file that will be cleaned up after test.
+    """
+    fd, temp_path = tempfile.mkstemp()
+    os.close(fd)
+    yield Path(temp_path)
+    if os.path.exists(temp_path):
+        os.unlink(temp_path)
+
+
+@pytest.fixture
+def mock_config() -> Dict[str, Any]:
+    """Provide a mock configuration dictionary for testing.
+
+    Returns:
+        Dict[str, Any]: A sample configuration dictionary.
+    """
+    return {
+        "batch_size": 32,
+        "learning_rate": 0.001,
+        "epochs": 10,
+        "features": ["feature1", "feature2", "feature3"],
+        "label": "target",
+        "model_dir": "/tmp/model",
+        "preprocessing": {
+            "normalize": True,
+            "scale": True,
+            "bucketize": False
+        }
+    }
+
+
+@pytest.fixture
+def sample_data() -> Dict[str, Any]:
+    """Generate sample data for testing transformations.
+
+    Returns:
+        Dict[str, Any]: A dictionary containing sample features and labels.
+    """
+    return {
+        "numeric_feature": [1.0, 2.0, 3.0, 4.0, 5.0],
+        "categorical_feature": ["A", "B", "A", "C", "B"],
+        "text_feature": ["hello world", "tensorflow transform", "test data"],
+        "label": [0, 1, 0, 1, 1]
+    }
+
+
+@pytest.fixture
+def tf_example_data() -> Generator[str, None, None]:
+    """Create a temporary TFRecord file with example data.
+
+    Yields:
+        str: Path to the temporary TFRecord file.
+    """
+    import tensorflow as tf
+
+    temp_path = tempfile.mktemp(suffix=".tfrecord")
+
+    # Create sample TF examples
+    writer = tf.io.TFRecordWriter(temp_path)
+
+    for i in range(5):
+        example = tf.train.Example(
+            features=tf.train.Features(
+                feature={
+                    "numeric_feature": tf.train.Feature(
+                        float_list=tf.train.FloatList(value=[float(i)])
+                    ),
+                    "categorical_feature": tf.train.Feature(
+                        bytes_list=tf.train.BytesList(value=[f"category_{i}".encode()])
+                    ),
+                    "label": tf.train.Feature(
+                        int64_list=tf.train.Int64List(value=[i % 2])
+                    )
+                }
+            )
+        )
+        writer.write(example.SerializeToString())
+
+    writer.close()
+
+    yield temp_path
+
+    if os.path.exists(temp_path):
+        os.unlink(temp_path)
+
+
+@pytest.fixture
+def mock_transform_output_dir(temp_dir: Path) -> Path:
+    """Create a mock transform output directory structure.
+
+    Args:
+        temp_dir: Temporary directory fixture.
+
+    Returns:
+        Path: Path to the transform output directory.
+    """
+    output_dir = temp_dir / "transform_output"
+    output_dir.mkdir()
+
+    # Create expected subdirectories
+    (output_dir / "transformed_metadata").mkdir()
+    (output_dir / "transform_fn").mkdir()
+    (output_dir / "transformed_data").mkdir()
+
+    return output_dir
+
+
+@pytest.fixture(autouse=True)
+def reset_tensorflow_state():
+    """Reset TensorFlow state between tests to avoid interference."""
+    yield
+    tf.keras.backend.clear_session()
+
+
+@pytest.fixture
+def mock_preprocessing_fn():
+    """Provide a simple preprocessing function for testing.
+
+    Returns:
+        callable: A preprocessing function that applies basic transformations.
+    """
+    def preprocessing_fn(inputs):
+        """Simple preprocessing function for testing."""
+        import tensorflow_transform as tft
+
+        outputs = {}
+
+        # Normalize numeric features
+        if "numeric_feature" in inputs:
+            outputs["numeric_feature_normalized"] = tft.scale_to_z_score(
+                inputs["numeric_feature"]
+            )
+
+        # Vocabulary for categorical features
+        if "categorical_feature" in inputs:
+            outputs["categorical_feature_integerized"] = tft.compute_and_apply_vocabulary(
+                inputs["categorical_feature"]
+            )
+
+        # Pass through labels
+        if "label" in inputs:
+            outputs["label"] = inputs["label"]
+
+        return outputs
+
+    return preprocessing_fn
+
+
+@pytest.fixture
+def mock_schema():
+    """Provide a mock schema for testing.
+
+    Returns:
+        schema_pb2.Schema: A simple schema for testing.
+    """
+    from tensorflow_metadata.proto.v0 import schema_pb2
+
+    schema = schema_pb2.Schema()
+
+    # Add numeric feature
+    numeric_feature = schema.feature.add()
+    numeric_feature.name = "numeric_feature"
+    numeric_feature.type = schema_pb2.FLOAT
+
+    # Add categorical feature
+    categorical_feature = schema.feature.add()
+    categorical_feature.name = "categorical_feature"
+    categorical_feature.type = schema_pb2.BYTES
+
+    # Add label
+    label_feature = schema.feature.add()
+    label_feature.name = "label"
+    label_feature.type = schema_pb2.INT
+
+    return schema
+
+
+# Markers for test organization
+def pytest_configure(config):
+    """Configure custom pytest markers."""
+    config.addinivalue_line(
+        "markers", "unit: mark test as a unit test (fast, isolated)"
+    )
+    config.addinivalue_line(
+        "markers", "integration: mark test as an integration test (may require external resources)"
+    )
+    config.addinivalue_line(
+        "markers", "slow: mark test as slow (deselect with '-m \"not slow\"')"
+    )
+
+
+# Test collection hooks
+def pytest_collection_modifyitems(config, items):
+    """Modify test collection to add markers based on test location."""
+    for item in items:
+        # Add markers based on test file location
+        if "unit" in str(item.fspath):
+            item.add_marker(pytest.mark.unit)
+        elif "integration" in str(item.fspath):
+            item.add_marker(pytest.mark.integration)
+
+        # Add slow marker for tests with "slow" in their name
+        if "slow" in item.name.lower():
+            item.add_marker(pytest.mark.slow)
\ No newline at end of file
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_minimal_setup.py b/tests/test_minimal_setup.py
new file mode 100644
index 00000000..574da859
--- /dev/null
+++ b/tests/test_minimal_setup.py
@@ -0,0 +1,37 @@
+"""Minimal validation tests that can run without all dependencies."""
+
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+
+class TestMinimalSetup:
+    """Minimal tests to validate basic setup."""
+
+    def test_pytest_works(self):
+        """Basic test to ensure pytest is functional."""
+        assert 1 + 1 == 2
+
+    def test_project_structure(self):
+        """Test that basic project structure is in place."""
+        root = Path(__file__).parent.parent
+        assert (root / "tests").exists()
+        assert (root / "pyproject.toml").exists()
+        assert (root / ".gitignore").exists()
+
+    def test_fixtures_work(self, tmp_path):
+        """Test that basic pytest fixtures work."""
+        test_file = tmp_path / "test.txt"
+        test_file.write_text("hello")
+        assert test_file.read_text() == "hello"
+
+    @pytest.mark.unit
+    def test_markers_work(self):
+        """Test that custom markers are functional."""
+        assert True
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
\ No newline at end of file
diff --git a/tests/test_setup_validation.py b/tests/test_setup_validation.py
new file mode 100644
index 00000000..57e7fb8b
--- /dev/null
+++ b/tests/test_setup_validation.py
@@ -0,0 +1,172 @@
+"""Validation tests to verify the testing infrastructure is properly set up."""
+
+import os
+import sys
+from pathlib import Path
+
+import pytest
+
+
+class TestInfrastructureSetup:
+    """Test class to validate the testing infrastructure setup."""
+
+    def test_pytest_is_installed(self):
+        """Verify pytest is installed and importable."""
+        import pytest
+        assert pytest.__version__
+
+    def test_pytest_cov_is_installed(self):
+        """Verify pytest-cov is installed and importable."""
+        import pytest_cov
+        assert pytest_cov
+
+    def test_pytest_mock_is_installed(self):
+        """Verify pytest-mock is installed and importable."""
+        import pytest_mock
+        assert pytest_mock
+
+    def test_project_structure_exists(self):
+        """Verify the expected project structure exists."""
+        project_root = Path(__file__).parent.parent
+
+        # Check main directories
+        assert project_root.exists()
+        assert (project_root / "tensorflow_transform").exists()
+        assert (project_root / "tests").exists()
+        assert (project_root / "tests" / "unit").exists()
+        assert (project_root / "tests" / "integration").exists()
+
+        # Check configuration files
+        assert (project_root / "pyproject.toml").exists()
+        assert (project_root / ".gitignore").exists()
+
+    def test_conftest_fixtures_available(self, temp_dir, mock_config, sample_data):
+        """Verify conftest fixtures are available and working."""
+        # Test temp_dir fixture
+        assert temp_dir.exists()
+        assert temp_dir.is_dir()
+
+        # Test mock_config fixture
+        assert isinstance(mock_config, dict)
+        assert "batch_size" in mock_config
+        assert "features" in mock_config
+
+        # Test sample_data fixture
+        assert isinstance(sample_data, dict)
+        assert "numeric_feature" in sample_data
+        assert "categorical_feature" in sample_data
+
+    def test_markers_are_registered(self):
+        """Verify custom markers are properly registered."""
+        markers = list(pytest.mark._markers)  # registered marker names (a set of strings)
+        assert "unit" in markers
+        assert "integration" in markers
+        assert "slow" in markers
+
+    @pytest.mark.unit
+    def test_unit_marker_works(self):
+        """Test that unit marker can be applied."""
+        assert True
+
+    @pytest.mark.integration
+    def test_integration_marker_works(self):
+        """Test that integration marker can be applied."""
+        assert True
+
+    @pytest.mark.slow
+    def test_slow_marker_works(self):
+        """Test that slow marker can be applied."""
+        assert True
+
+    def test_tensorflow_import(self):
+        """Verify TensorFlow can be imported."""
+        import tensorflow as tf
+        assert tf.__version__
+
+    def test_tensorflow_transform_import(self):
+        """Verify tensorflow_transform can be imported."""
+        sys.path.insert(0, str(Path(__file__).parent.parent))
+        import tensorflow_transform as tft
+        assert tft.__version__
+
+    def test_coverage_configuration(self):
+        """Verify coverage is properly configured."""
+        project_root = Path(__file__).parent.parent
+        pyproject_path = project_root / "pyproject.toml"
+
+        with open(pyproject_path, "r") as f:
+            content = f.read()
+
+        # Check coverage configuration exists
+        assert "[tool.coverage.run]" in content
+        assert "[tool.coverage.report]" in content
+        assert "source = [\"tensorflow_transform\"]" in content
+        assert "--cov-fail-under=80" in content
+
+    def test_pytest_configuration(self):
+        """Verify pytest is properly configured."""
+        project_root = Path(__file__).parent.parent
+        pyproject_path = project_root / "pyproject.toml"
+
+        with open(pyproject_path, "r") as f:
+            content = f.read()
+
+        # Check pytest configuration exists
+        assert "[tool.pytest.ini_options]" in content
+        assert "testpaths = [\"tests\"]" in content
+        assert "--strict-markers" in content
+
+    def test_poetry_scripts_configured(self):
+        """Verify Poetry scripts are properly configured."""
+        project_root = Path(__file__).parent.parent
+        pyproject_path = project_root / "pyproject.toml"
+
+        with open(pyproject_path, "r") as f:
+            content = f.read()
+
+        # Check poetry scripts exist
+        assert "[tool.poetry.scripts]" in content
+        assert 'test = "pytest:main"' in content
+        assert 'tests = "pytest:main"' in content
+
+
+class TestMockingCapabilities:
+    """Test class to validate mocking capabilities."""
+
+    def test_pytest_mock_fixture(self, mocker):
+        """Test that pytest-mock mocker fixture works."""
+        mock_func = mocker.Mock(return_value="mocked")
+        assert mock_func() == "mocked"
+        mock_func.assert_called_once()
+
+    def test_mock_patch(self, mocker):
+        """Test that patching with mocker works."""
+        mock_os = mocker.patch("os.path.exists")
+        mock_os.return_value = True
+
+        result = os.path.exists("/fake/path")
+        assert result is True
+        mock_os.assert_called_with("/fake/path")
+
+
+class TestTempFileHandling:
+    """Test class to validate temporary file handling."""
+
+    def test_temp_dir_cleanup(self, temp_dir):
+        """Test that temp_dir is created and will be cleaned up."""
+        test_file = temp_dir / "test.txt"
+        test_file.write_text("test content")
+
+        assert test_file.exists()
+        assert test_file.read_text() == "test content"
+
+    def test_temp_file_cleanup(self, temp_file):
+        """Test that temp_file is created and will be cleaned up."""
+        assert temp_file.exists()
+        temp_file.write_text("temporary content")
+        assert temp_file.read_text() == "temporary content"
+
+
+if __name__ == "__main__":
+    # Run tests when script is executed directly
+    pytest.main([__file__, "-v"])
\ No newline at end of file
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000..e69de29b
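
Editor's note, for illustration only (not part of the patch): below is a minimal sketch of a test module that follows the conventions this patch introduces. The file name and assertions are hypothetical; the test relies on the mock_config and sample_data fixtures defined in tests/conftest.py and on the unit marker registered in pyproject.toml, so it would be collected from tests/unit/ and selectable with `poetry run test -m unit`.

# tests/unit/test_conventions_example.py -- hypothetical example, not included in the patch above
"""Example unit tests using the shared fixtures and custom markers."""

import pytest


@pytest.mark.unit
def test_mock_config_has_expected_keys(mock_config):
    # mock_config is the shared fixture from tests/conftest.py
    assert mock_config["batch_size"] == 32
    assert "feature1" in mock_config["features"]


@pytest.mark.unit
def test_sample_data_features_align_with_labels(sample_data):
    # sample_data is the shared fixture from tests/conftest.py
    assert len(sample_data["numeric_feature"]) == len(sample_data["label"])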