Skip to content

Commit

Permalink
Label PRs when the json schema changes (#2240)
Browse files Browse the repository at this point in the history
* label PRs when the json schema changes

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* moderate pr comments

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* be more strict about processing file names

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

---------

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
  • Loading branch information
wagoodman authored Oct 20, 2023
1 parent ef43294 commit 8f6bdde
Show file tree
Hide file tree
Showing 6 changed files with 354 additions and 1 deletion.
224 changes: 224 additions & 0 deletions .github/scripts/labeler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
from __future__ import annotations

import sys
import glob
import subprocess
import os
import re

# When True, `run` only prints the command it was given instead of executing it.
# `main` switches this on when no PR number is available or when file lists are supplied.
DRY_RUN = False


def main(changed_files: str | None = None, merge_base_schema_files: str | None = None):
    """Label (and possibly comment on) a PR based on the JSON schema files it touches.

    The PR number and the comment file path are read from the GITHUB_PR_NUMBER and
    CI_COMMENT_FILE environment variables.

    Args:
        changed_files: optional path to a file listing the PR's changed files, one per
            line. When given, the run is a dry run and `gh` is not queried.
        merge_base_schema_files: path to a file listing the schema files of the merge
            base, one per line. Only read when `changed_files` is given.

    Exits with status 1 when CI_COMMENT_FILE is unset, when no schema files are found in
    the merge base, or (without input files) when not running in CI or not on a PR.
    """
    global DRY_RUN

    def read_lines(path):
        with open(path) as handle:
            return handle.read().splitlines()

    pr_number = os.environ.get("GITHUB_PR_NUMBER")
    comment_file_path = os.environ.get("CI_COMMENT_FILE")

    if not comment_file_path:
        print("CI_COMMENT_FILE not set")
        sys.exit(1)

    # without a PR to act on, or when fed file lists, commands are only printed
    if not pr_number or changed_files:
        DRY_RUN = True

    if changed_files:
        # read lines from file... this is useful for local testing
        changed_paths = read_lines(changed_files)
        base_schemas = sort_json_schema_files(read_lines(merge_base_schema_files))
    else:
        if not is_ci():
            print("Not in CI")
            sys.exit(1)

        if not pr_number:
            print("Not a PR")
            sys.exit(1)

        changed_paths = get_pr_changed_files(pr_number)
        # since we are running this in the context of the pull_request_target, the checkout is the
        # merge base.. that is the main branch of the original repo, NOT the branch in the forked repo
        # (or branch in the target repo for non-forked PRs). This means we just need to list the
        # currently checked out files to get a sense of the changes before a merge.
        base_schemas = list_json_schema_files()

    pr_schemas = filter_to_schema_files(changed_paths)

    if not base_schemas:
        print("No schema files found in merge base")
        sys.exit(1)

    # the PR's schema files cover added, removed, and changed files; anything that is
    # not already in the merge base is new, the rest already existed
    new_schema_files = set(pr_schemas) - set(base_schemas)
    removed_or_modified_schema_files = set(pr_schemas) - set(new_schema_files)

    print("new schemas: ", summarize_schema_files(new_schema_files))
    print("removed or modified schemas:", summarize_schema_files(removed_or_modified_schema_files))

    # any new or modified schema means the PR gets the "json-schema" label...
    if new_schema_files or removed_or_modified_schema_files:
        print("\nAdding json-schema label...")
        add_label(pr_number, "json-schema")
    else:
        remove_label(pr_number, "json-schema")

    # new schema files are compared against the latest existing version: a major version
    # bump is a breaking change and is flagged with the "breaking-change" label...
    latest_schema = base_schemas[-1]
    if is_breaking_change(new_schema_files, latest_schema):
        print("\nBreaking change detected...")
        add_label(pr_number, "breaking-change")
    else:
        remove_label(pr_number, "breaking-change")

    # nothing else to do unless an existing schema was modified or removed
    if not removed_or_modified_schema_files:
        return

    # modifying an existing schema could be a breaking change and removing one should
    # never be allowed, so warn about both on the PR via a comment...
    print("\nRemoved or modified schema detected...")
    schemas = sort_json_schema_files(list(removed_or_modified_schema_files))
    schemas_str = "\n".join([f" - {schema}" for schema in schemas])
    add_comment(comment_file_path, f"Detected modification or removal of existing json schemas:\n{schemas_str}", warning=True)


def add_comment(comment_file_path: str, comment: str, warning: bool = False, important: bool = False):
    """Write a PR comment body to `comment_file_path` and echo it to stdout.

    Args:
        comment_file_path: file to (over)write with the comment; missing parent
            directories are created.
        comment: the comment text.
        warning: quote the comment under a "[!WARNING]" alert header.
        important: quote the comment under an "[!IMPORTANT]" alert header (ignored when
            `warning` is also set).
    """
    if warning or important:
        quoted = "\n".join(f"> {line}" for line in comment.splitlines())
        header = "> [!WARNING]" if warning else "> [!IMPORTANT]"
        comment = f"{header}\n{quoted}"

    # create any parent directories if they don't exist; a bare file name has an empty
    # dirname, which os.makedirs would reject
    parent_dir = os.path.dirname(comment_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(comment_file_path, "w", encoding="utf-8") as f:
        f.write(comment)

    print(f"Comment file contents: {comment_file_path}")
    print(comment)


def add_label(pr_number: str, label: str):
    """Add `label` to the given PR via "gh pr edit --add-label <label>".

    Prints the command's stderr and exits with status 1 when the command fails.
    """
    # text=True so that stderr is captured as a string rather than bytes
    result = run(f"gh pr edit {pr_number} --add-label {label}", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print(f"Unable to add {label!r} label to PR {pr_number}")
        print(result.stderr)
        sys.exit(1)


def remove_label(pr_number: str, label: str):
    """Remove `label` from the given PR via "gh pr edit --remove-label <label>".

    Prints the command's stderr and exits with status 1 when the command fails.
    """
    # text=True so that stderr is captured as a string rather than bytes
    result = run(f"gh pr edit {pr_number} --remove-label {label}", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print(f"Unable to remove {label!r} label from PR {pr_number}")
        print(result.stderr)
        sys.exit(1)


def major_version(semver: str) -> int:
    """Return the leading (major) component of a dotted version string as an int."""
    leading, _, _ = semver.partition(".")
    return int(leading)


def is_breaking_change(new_schema_files: set[str], latest_schema_file: str) -> bool:
    """Report whether any new schema file has a higher major version than the latest one.

    Args:
        new_schema_files: paths of schema files introduced by the PR.
        latest_schema_file: path of the newest schema file already present.
    """
    baseline = major_version(get_semver(latest_schema_file))
    return any(
        major_version(get_semver(candidate)) > baseline
        for candidate in new_schema_files
    )


def summarize_schema_files(files: list[str]) -> list[str]:
    """Return the version string of each given schema file, in iteration order."""
    return list(map(get_semver, files))


def is_ci() -> bool:
    """Report whether the CI environment variable is set (to any value, even empty)."""
    return os.environ.get("CI") is not None


def get_pr_changed_files(pr_number: str) -> list[str]:
    """Return the paths of the files changed in the given PR, as reported by `gh pr view`.

    Prints the command's stderr and exits with status 1 when the command fails.
    """
    command = f"gh pr view {pr_number} --json files --jq '.files.[].path'"
    completed = run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if completed.returncode == 0:
        # one path per line of output
        return completed.stdout.splitlines()

    print("Unable to get list of changed files in PR")
    print(str(completed.stderr))
    sys.exit(1)


def filter_to_schema_files(list_of_files: list[str]) -> list[str]:
    """Keep only paths of the form "schema/json/schema-<X>.<Y>.<Z>.json", sorted by version."""
    schema_paths = [
        path
        for path in list_of_files
        if re.match(r"^schema/json/schema-\d+\.\d+\.\d+\.json$", path)
    ]
    return sort_json_schema_files(schema_paths)


def list_json_schema_files() -> list[str]:
    """List the "schema-*.json" files under "schema/json" (relative to the cwd), sorted by version."""
    found = glob.glob("schema/json/schema-*.json")
    return sort_json_schema_files(list(found))


def run(command: str, **kwargs) -> subprocess.CompletedProcess:
    """Run `command` through subprocess.run, or only print it when DRY_RUN is set.

    Args:
        command: the command to execute.
        **kwargs: passed through unchanged to subprocess.run.

    Returns:
        The completed process. In dry-run mode this is a successful placeholder with
        empty stdout/stderr, so callers can read the output without special-casing.
    """
    if DRY_RUN:
        print(f"[DRY RUN] {command}")
        return subprocess.CompletedProcess(args=[command], returncode=0, stdout="", stderr="")
    print(f"[RUN] {command}")
    return subprocess.run(command, **kwargs)


def get_semver(input_file: str) -> str:
    """Extract the version from a schema file path.

    For "schema/json/schema-1.12.1.json" this returns "1.12.1": the text between the
    first "-" of the file name and the ".json" suffix.

    Raises:
        IndexError: if the file name contains no "-".
    """
    # only inspect the file name, so hyphens in parent directories cannot shift the split
    file_name = os.path.basename(input_file)
    return file_name.split("-")[1].split(".json")[0]


def sort_json_schema_files(files: list[str]) -> list[str]:
    """Return the schema paths ordered by their embedded version, oldest first.

    The ordering is numeric per version component rather than lexical, so that
    "schema/json/schema-1.2.1.json" comes before "schema/json/schema-1.12.1.json".
    Empty entries are dropped, and each result is rebuilt as
    "schema/json/schema-<version>.json".
    """

    def numeric_key(version: str) -> list[int]:
        return [int(part) for part in version.split('.')]

    ordered = [get_semver(path) for path in files if path]
    ordered.sort(key=numeric_key)

    return [f"schema/json/schema-{version}.json" for version in ordered]


# allow for test files that have line-by-line list of files:

# .binny.yaml
# .github/actions/bootstrap/action.yaml
# .github/scripts/goreleaser-install.sh
# .github/workflows/release.yaml
# .github/workflows/update-bootstrap-tools.yml
# .github/workflows/update-cpe-dictionary-index.yml
# .github/workflows/update-stereoscope-release.yml
# .github/workflows/validations.yaml
# .gitignore
# .goreleaser.yaml
# Makefile
# Taskfile.yaml
# schema/cyclonedx/Makefile

if __name__ == "__main__":
    # optional positional arguments: two file names, each naming a file that holds a
    # line-separated list of files (the PR's changed files, then the merge base schemas)
    if len(sys.argv) > 2:
        changed_files, merge_base_schema_files = sys.argv[1], sys.argv[2]
    else:
        changed_files = merge_base_schema_files = None

    main(changed_files, merge_base_schema_files)

65 changes: 65 additions & 0 deletions .github/scripts/labeler_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import unittest
from unittest.mock import patch
import subprocess

import labeler

class Labeler(unittest.TestCase):
    """Unit tests for the helper functions in labeler.py."""

    def test_major_version(self):
        self.assertEqual(labeler.major_version("1.2.3"), 1)
        self.assertEqual(labeler.major_version("2.0.0"), 2)

    def test_is_breaking_change(self):
        new_schema_files = {"schema/json/schema-2.0.0.json"}
        latest_schema_file = "schema/json/schema-1.2.0.json"
        self.assertTrue(labeler.is_breaking_change(new_schema_files, latest_schema_file))

        new_schema_files = {"schema/json/schema-1.3.0.json"}
        latest_schema_file = "schema/json/schema-1.2.0.json"
        self.assertFalse(labeler.is_breaking_change(new_schema_files, latest_schema_file))

    def test_summarize_schema_files(self):
        files = ["schema/json/schema-1.0.0.json", "schema/json/schema-2.0.0.json"]
        expected = ["1.0.0", "2.0.0"]
        self.assertEqual(labeler.summarize_schema_files(files), expected)

    def test_is_ci(self):
        # Mock os.environ to simulate CI environment
        with patch.dict("os.environ", {"CI": "true"}):
            self.assertTrue(labeler.is_ci())

        # ...and an environment without the CI variable
        with patch.dict("os.environ", {}, clear=True):
            self.assertFalse(labeler.is_ci())

    def test_get_pr_changed_files(self):
        expected_command = "gh pr view 123 --json files --jq '.files.[].path'"
        expected_output = "file1.json\nfile2.json\n"

        # use a real instance: assigning attributes on the CompletedProcess class itself
        # would leak into every other user of subprocess in this process
        completed = subprocess.CompletedProcess(
            args=[expected_command],
            returncode=0,
            stdout=expected_output,
            stderr="",
        )
        with patch("labeler.run", return_value=completed) as mock_run:
            result = labeler.get_pr_changed_files("123")
            mock_run.assert_called_with(expected_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            self.assertEqual(result, ["file1.json", "file2.json"])

    def test_filter_to_schema_files(self):
        input_files = ["schema/json/schema-1.0.0.json", "not_schema.txt", "schema/json/schema-2.0.0.json"]
        expected_files = ["schema/json/schema-1.0.0.json", "schema/json/schema-2.0.0.json"]
        self.assertEqual(labeler.filter_to_schema_files(input_files), expected_files)

        # we should be strict about what files are allowed to be processed
        input_files = ["schema/json/schema-1.0.0extracontent.json", "schema/json/schema-1.0.0.md", "schema/json/schema-1.0.0.json.extracontent"]
        expected_files = []
        self.assertEqual(labeler.filter_to_schema_files(input_files), expected_files)

    def test_get_semver(self):
        input_file = "schema/json/schema-1.0.0.json"
        expected_semver = "1.0.0"
        self.assertEqual(labeler.get_semver(input_file), expected_semver)

    def test_sort_json_schema_files(self):
        files = ["schema/json/schema-1.12.1.json", "schema/json/schema-1.2.1.json"]
        expected_sorted_files = ["schema/json/schema-1.2.1.json", "schema/json/schema-1.12.1.json"]
        self.assertEqual(labeler.sort_json_schema_files(files), expected_sorted_files)


if __name__ == "__main__":
    # run the whole suite when this file is executed directly as a script
    unittest.main()
54 changes: 54 additions & 0 deletions .github/workflows/labeler.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
name: "Detect schema changes"

on:
# IMPORTANT! This workflow is triggered by the `pull_request_target` event
# which means that forked PRs will run with access to secrets from the repo
# it's forked from (the "target" repo).
#
# For this reason we NEVER check out the code from the pull request
# (e.g. "ref: ${{ github.event.pull_request.head.sha }}") to prevent
# accidentally running potentially untrusted code.
#
# By default the checkout will be:
# - GITHUB_SHA: Last commit on the PR base branch
# - GITHUB_REF: PR base branch
#
# ...unlike a typical PR where:
# - GITHUB_SHA: Last merge commit on the GITHUB_REF branch
# - GITHUB_REF: PR merge branch refs/pull/:prNumber/merge
pull_request_target:

env:
# note: this is used within hashFiles() so must be within the GITHUB_WORKSPACE path (or will silently fail)
CI_COMMENT_FILE: .tmp/labeler-comment.txt
# needs to be any string to uniquely identify the comment on a PR across multiple runs
COMMENT_HEADER: "label-commentary"

jobs:
label:
name: "Label changes"
runs-on: ubuntu-22.04
steps:

- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 #v4.1.1

- run: python .github/scripts/labeler.py
env:
# note: this token has write access to the repo
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_PR_NUMBER: ${{ github.event.number }}

- name: Delete existing comment
if: ${{ hashFiles( env.CI_COMMENT_FILE ) == '' }}
uses: marocchino/sticky-pull-request-comment@efaaab3fd41a9c3de579aba759d2552635e590fd #v2.8.0
with:
header: ${{ env.COMMENT_HEADER }}
hide: true
hide_classify: "OUTDATED"

- name: Add comment
if: ${{ hashFiles( env.CI_COMMENT_FILE ) != '' }}
uses: marocchino/sticky-pull-request-comment@efaaab3fd41a9c3de579aba759d2552635e590fd #v2.8.0
with:
header: ${{ env.COMMENT_HEADER }}
path: ${{ env.CI_COMMENT_FILE }}
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,8 @@ test/integration/test-fixtures/**/go.sum
# attestation
cosign.key
cosign.pub

# Byte-compiled object files for python
__pycache__/
*.py[cod]
*$py.class
1 change: 1 addition & 0 deletions DEVELOPING.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ In order to test and develop in this repo you will need the following dependenci
- Golang
- docker
- make
- Python (>= 3.9)

### Docker settings for getting started
Make sure you've updated your docker settings so the default docker socket path is available.
Expand Down
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ all: static-analysis test ## Run all linux-based checks (linting, license check,
static-analysis: check-go-mod-tidy lint check-licenses check-json-schema-drift ## Run all static analysis checks

.PHONY: test
test: unit integration validate-cyclonedx-schema benchmark cli ## Run all tests (currently unit, integration, linux compare, and cli tests)
test: unit integration validate-cyclonedx-schema benchmark test-utils cli ## Run all tests (currently unit, integration, linux compare, and cli tests)


## Bootstrapping targets #################################
Expand Down Expand Up @@ -167,6 +167,10 @@ cli: $(SNAPSHOT_DIR) ## Run CLI tests
SYFT_BINARY_LOCATION='$(SNAPSHOT_BIN)' \
go test -count=1 -timeout=15m -v ./test/cli

.PHONY: test-utils
test-utils:
python .github/scripts/labeler_test.py


## Benchmark test targets #################################

Expand Down

0 comments on commit 8f6bdde

Please sign in to comment.