fix: fix incorrect skip result evaluation causing false positives in PyPI malware reporting (#1031)

art1f1c3R · web-flow · commit 199809ed3353 · 2025-04-09T16:26:26.000+10:00
Resolved issue in ProbLog model where skip results were evaluated as false, causing many false positives. Rule IDs have also been added.

Signed-off-by: Carl Flottmann &lt;carl.flottmann@oracle.com&gt;
diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md
@@ -61,8 +61,9 @@ When contributing an analyzer, it must meet the following requirements:
 - The analyzer name must be added to [heuristics.py](./pypi_heuristics/heuristics.py) file so it can be used for rule combinations in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py)
 - Update the `malware_rules_problog_model` in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) with logical statements where the heuristic should be included. When adding new rules, please follow the following guidelines:
    - Provide a [confidence value](../slsa_analyzer/checks/check_result.py) using the `Confidence` enum.
-   - Provide a name based on this confidence value (i.e. `high`, `medium`, or `low`)
-   - If it does not already exist, make sure to assign this to the result variable (`problog_result_access`)
+   - Ensure it is assigned to the `problog_result_access` string variable, otherwise it will not be queried and evaluated.
+   - Assign a rule ID to the rule. This will be used to backtrack to determine if it was triggered.
+   - Make sure to wrap pass/fail statements in `passed()` and `failed()`. Not doing so may result in undesirable behaviour, see the comments in the model for more details.
    - If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples).  
 
 ### Confidence Score Motivation
diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
@@ -128,7 +128,9 @@ def validate_malware(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[boo
         is_malware, detail_info = sourcecode_analyzer.analyze()
         return is_malware, detail_info
 
-    def evaluate_heuristic_results(self, heuristic_results: dict[Heuristics, HeuristicResult]) -> float | None:
+    def evaluate_heuristic_results(
+        self, heuristic_results: dict[Heuristics, HeuristicResult]
+    ) -> tuple[float, JsonType]:
         """Analyse the heuristic results to determine the maliciousness of the package.
 
         Parameters
@@ -138,18 +140,19 @@ def evaluate_heuristic_results(self, heuristic_results: dict[Heuristics, Heurist
 
         Returns
         -------
-        float | None
-            Returns the confidence associated with the detected malicious combination, otherwise None if no associated
-            malicious combination was triggered.
+        tuple[float, JsonType]
+            Returns the confidence associated with the detected malicious combination, and associated rule IDs detailing
+            what rules were triggered and their confidence as a dict[str, float] type.
         """
         facts_list: list[str] = []
+        triggered_rules: dict[str, JsonType] = {}
+
         for heuristic, result in heuristic_results.items():
-            if result == HeuristicResult.SKIP:
-                facts_list.append(f"0.0::{heuristic.value}.")
-            elif result == HeuristicResult.PASS:
+            if result == HeuristicResult.PASS:
                 facts_list.append(f"{heuristic.value} :- true.")
-            else:  # HeuristicResult.FAIL
+            elif result == HeuristicResult.FAIL:
                 facts_list.append(f"{heuristic.value} :- false.")
+            # Do not define for HeuristicResult.SKIP
 
         facts = "\n".join(facts_list)
         problog_code = f"{facts}\n\n{self.malware_rules_problog_model}"
@@ -158,10 +161,13 @@ def evaluate_heuristic_results(self, heuristic_results: dict[Heuristics, Heurist
         problog_model = PrologString(problog_code)
         problog_results: dict[Term, float] = get_evaluatable().create_from(problog_model).evaluate()
 
-        confidence: float | None = problog_results.get(Term(self.problog_result_access))
-        if confidence == 0.0:
-            return None  # no rules were triggered
-        return confidence
+        confidence = problog_results.pop(Term(self.problog_result_access), 0.0)
+        if confidence > 0:  # a rule was triggered
+            for term, conf in problog_results.items():
+                if term.args:
+                    triggered_rules[str(term.args[0])] = conf
+
+        return confidence, triggered_rules
 
     def run_heuristics(
         self, pypi_package_json: PyPIPackageJsonAsset
@@ -278,9 +284,10 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
                         except HeuristicAnalyzerValueError:
                             return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN)
 
-                        confidence = self.evaluate_heuristic_results(result)
+                        confidence, triggered_rules = self.evaluate_heuristic_results(result)
+                        detail_info["triggered_rules"] = triggered_rules
                         result_type = CheckResultType.FAILED
-                        if confidence is None:
+                        if not confidence:
                             confidence = Confidence.HIGH
                             result_type = CheckResultType.PASSED
                         elif ctx.dynamic_data["validate_malware"]:
@@ -321,51 +328,79 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
         AnomalousVersionAnalyzer,
     ]
 
+    # name used to query the result of all problog rules, so it can be accessed outside the model.
     problog_result_access = "result"
 
     malware_rules_problog_model = f"""
-    % Heuristic groupings
+    % ----- Wrappers ------
+    % When a heuristic is skipped, it is ommitted from the problog model facts definition. This means that references in this
+    % static model must account for when they are not existent. These wrappers perform this function using the inbuilt try_call
+    % problog function. It will try to evaluate the provided logic, and return false if it encounters an error, such as the fact
+    % not being defined. For example, you are expecting A to pass, so we do:
+    %
+    % passed(A)
+    %
+    % If A was 'true', then this will return true, as A did pass. If A was 'false', then this will return false, as A did not pass.
+    % If A was not defined, then this will return false, as A did not pass.
+    % Please use these wrappers throughout the problog model for logic definitions.
+
+    passed(H) :- try_call(H).
+    failed(H) :- try_call(not H).
+
+    % ----- Heuristic groupings -----
     % These are common combinations of heuristics that are used in many of the rules, thus themselves representing
     % certain behaviors. When changing or adding rules here, if there are frequent combinations of particular
     % heuristics, group them together here.
 
     % Maintainer has recently joined, publishing an undetailed page with no links.
-    quickUndetailed :- not {Heuristics.EMPTY_PROJECT_LINK.value}, not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}.
+    quickUndetailed :- failed({Heuristics.EMPTY_PROJECT_LINK.value}), failed({Heuristics.CLOSER_RELEASE_JOIN_DATE.value}).
 
     % Maintainer releases a suspicious setup.py and forces it to run by omitting a .whl file.
-    forceSetup :- not {Heuristics.SUSPICIOUS_SETUP.value}, not {Heuristics.WHEEL_ABSENCE.value}.
+    forceSetup :- failed({Heuristics.SUSPICIOUS_SETUP.value}), failed({Heuristics.WHEEL_ABSENCE.value}).
 
-    % Suspicious Combinations
+    % ----- Suspicious Combinations -----
 
     % Package released recently with little detail, forcing the setup.py to run.
-    {Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.ONE_RELEASE.value}.
-    {Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.HIGH_RELEASE_FREQUENCY.value}.
+    {Confidence.HIGH.value}::trigger(malware_high_confidence_1) :-
+        quickUndetailed, forceSetup, failed({Heuristics.ONE_RELEASE.value}).
+    {Confidence.HIGH.value}::trigger(malware_high_confidence_2) :-
+        quickUndetailed, forceSetup, failed({Heuristics.HIGH_RELEASE_FREQUENCY.value}).
 
     % Package released recently with little detail, with some more refined trust markers introduced: project links,
     % multiple different releases, but there is no source code repository matching it and the setup is suspicious.
-    {Confidence.HIGH.value}::high :- not {Heuristics.SOURCE_CODE_REPO.value},
-        not {Heuristics.HIGH_RELEASE_FREQUENCY.value},
-        not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value},
-        {Heuristics.UNCHANGED_RELEASE.value},
+    {Confidence.HIGH.value}::trigger(malware_high_confidence_3) :-
+        failed({Heuristics.SOURCE_CODE_REPO.value}),
+        failed({Heuristics.HIGH_RELEASE_FREQUENCY.value}),
+        passed({Heuristics.UNCHANGED_RELEASE.value}),
+        failed({Heuristics.CLOSER_RELEASE_JOIN_DATE.value}),
         forceSetup.
 
     % Package released recently with little detail, with multiple releases as a trust marker, but frequent and with
     % the same code.
-    {Confidence.MEDIUM.value}::medium :- quickUndetailed,
-        not {Heuristics.HIGH_RELEASE_FREQUENCY.value},
-        not {Heuristics.UNCHANGED_RELEASE.value},
-        {Heuristics.SUSPICIOUS_SETUP.value}.
+    {Confidence.MEDIUM.value}::trigger(malware_medium_confidence_1) :-
+        quickUndetailed,
+        failed({Heuristics.HIGH_RELEASE_FREQUENCY.value}),
+        failed({Heuristics.UNCHANGED_RELEASE.value}),
+        passed({Heuristics.SUSPICIOUS_SETUP.value}).
 
     % Package released recently with little detail and an anomalous version number for a single-release package.
-    {Confidence.MEDIUM.value}::medium :- quickUndetailed,
-        not {Heuristics.ONE_RELEASE.value},
-        {Heuristics.WHEEL_ABSENCE.value},
-        not {Heuristics.ANOMALOUS_VERSION.value}.
-
-    {problog_result_access} :- high.
-    {problog_result_access} :- medium.
-
+    {Confidence.MEDIUM.value}::trigger(malware_medium_confidence_2) :-
+        quickUndetailed,
+        failed({Heuristics.ONE_RELEASE.value}),
+        failed({Heuristics.ANOMALOUS_VERSION.value}).
+
+    % ----- Evaluation -----
+
+    % Aggregate result
+    {problog_result_access} :- trigger(malware_high_confidence_1).
+    {problog_result_access} :- trigger(malware_high_confidence_2).
+    {problog_result_access} :- trigger(malware_high_confidence_3).
+    {problog_result_access} :- trigger(malware_medium_confidence_2).
+    {problog_result_access} :- trigger(malware_medium_confidence_1).
     query({problog_result_access}).
+
+    % Explainability
+    query(trigger(_)).
     """
 
 
diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py
@@ -12,6 +12,7 @@
 from pytest_httpserver import HTTPServer
 
 from macaron.config.defaults import load_defaults
+from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
 from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
 from macaron.slsa_analyzer.checks.check_result import CheckResultType
 from macaron.slsa_analyzer.checks.detect_malicious_metadata_check import DetectMaliciousMetadataCheck
@@ -98,3 +99,34 @@ def test_detect_malicious_metadata(
     ).respond_with_json({})
 
     assert check.run_check(ctx).result_type == expected
+
+
+@pytest.mark.parametrize(
+    ("combination"),
+    [
+        pytest.param(
+            {
+                # similar to rule ID malware_high_confidence_1, but SUSPICIOUS_SETUP is skipped since the file does not exist,
+                # so the rule should not trigger.
+                Heuristics.EMPTY_PROJECT_LINK: HeuristicResult.FAIL,
+                Heuristics.SOURCE_CODE_REPO: HeuristicResult.SKIP,
+                Heuristics.ONE_RELEASE: HeuristicResult.FAIL,
+                Heuristics.HIGH_RELEASE_FREQUENCY: HeuristicResult.SKIP,
+                Heuristics.UNCHANGED_RELEASE: HeuristicResult.SKIP,
+                Heuristics.CLOSER_RELEASE_JOIN_DATE: HeuristicResult.FAIL,
+                Heuristics.SUSPICIOUS_SETUP: HeuristicResult.SKIP,
+                Heuristics.WHEEL_ABSENCE: HeuristicResult.FAIL,
+                Heuristics.ANOMALOUS_VERSION: HeuristicResult.PASS,
+            },
+            id="test_skipped_evaluation",
+        )
+    ],
+)
+def test_evaluations(combination: dict[Heuristics, HeuristicResult]) -> None:
+    """Test heuristic combinations to ensure they evaluate as expected."""
+    check = DetectMaliciousMetadataCheck()
+
+    confidence, triggered_rules = check.evaluate_heuristic_results(combination)
+    assert confidence == 0
+    # Expecting this to be a dictionary, so we can ignore the type problems
+    assert len(dict(triggered_rules)) == 0  # type: ignore[arg-type]