Remove Python code from JSON rules #1745

* New rules: /scancode-categories/src/json_rules/json_rules_simple_01.json * Seems to work well on test codebase bionic-master-libc-bionic.tar.gz-extract (largely C++). * Next steps include expanding rules using more-diverse test codebases. * No formal test suite yet but coming soon. * This branch also includes code for 'Hello ScanCode' plugin illustrated in ScanCode wiki entry 'How To: Add a post scan plugin' (see /scancode-hello/). Signed-off-by: John M. Horan <johnmhoran@gmail.com>
aboutcode-org · Oct 10, 2019 · f0417d7 · f0417d7
1 parent b6287a3
commit f0417d7
Show file tree

Hide file tree

Showing 15 changed files with 937 additions and 8 deletions.
diff --git a/plugins/scancode-categories/src/json_rules/json_rules_simple_01.json b/plugins/scancode-categories/src/json_rules/json_rules_simple_01.json
@@ -0,0 +1,116 @@
+{
+    "new_rules": [
+        {
+            "rule": "Blueprint files",
+            "domain": "General",
+            "notes": "This is a non-core Blueprint file.",
+            "status": "Non-core Blueprint file",
+            "test": [
+                {
+                    "operator": "and",
+                    "extension": [".bp"],
+                    "file_type": ["ASCII text"],
+                    "mime_type": [],
+                    "name": [],
+                    "programming_language": []
+                }
+            ]
+        },
+        {
+            "rule": "C++ files",
+            "domain": "General",
+            "notes": "This is a C++ file.",
+            "status": "Core code",
+            "test": [
+                {
+                    "operator": "and",
+                    "extension": [".cpp"],
+                    "file_type": ["C source, ASCII text", "C++ source, ASCII text", "ASCII text"],
+                    "mime_type": [],
+                    "name": [],
+                    "programming_language": ["C++"]
+                }
+            ]
+        },
+        {
+            "rule": "C++ files with .c extension",
+            "domain": "General",
+            "notes": "This is a C++ file with a .c extension.",
+            "status": "Core code",
+            "test": [
+                {
+                    "operator": "and",
+                    "extension": [".c"],
+                    "file_type": ["C source, ASCII text"],
+                    "mime_type": [],
+                    "name": [],
+                    "programming_language": ["C++"]
+                }
+            ]
+        },
+        {
+            "rule": "C++ header files",
+            "domain": "General",
+            "notes": "This is a C++ header file, i.e., with a .h extension.",
+            "status": "Core code",
+            "test": [
+                {
+                    "operator": "and",
+                    "extension": [".h"],
+                    "file_type": ["ASCII text", "C++ source, ASCII text", "C source, ASCII text"],
+                    "mime_type": [],
+                    "name": [],
+                    "programming_language": ["C++"]
+                }
+            ]
+        },
+        {
+            "rule": "JavaScript map files",
+            "domain": "General",
+            "notes": "This is a non-core JavaScript map file.",
+            "status": "Non-core JavaScript map file",
+            "test": [
+                {
+                    "operator": "and",
+                    "extension": [".map"],
+                    "file_type": ["ASCII text"],
+                    "mime_type": [],
+                    "name": [],
+                    "programming_language": []
+                }
+            ]
+        },
+        {
+            "rule": "Test: all empty values",
+            "domain": "General",
+            "notes": "This is an empty value test -- every file (but no directories) should pass.",
+            "status": "Empty value test",
+            "test": [
+                {
+                    "operator": "and",
+                    "extension": [],
+                    "file_type": [],
+                    "mime_type": [],
+                    "name": [],
+                    "programming_language": []
+                }
+            ]
+        },
+        {
+            "rule": "Test: specific file name with OR operator",
+            "domain": "General",
+            "notes": "This is a file name OR test -- it identifies 2 specific files in the libc-bionic archive.",
+            "status": "File name OR test",
+            "test": [
+                {
+                    "operator": "or",
+                    "extension": ["any_value"],
+                    "file_type": ["any_value"],
+                    "mime_type": ["any_value"],
+                    "name": ["scudo.cpp", "exported64.map"],
+                    "programming_language": ["any_value"]
+                }
+            ]
+        }
+    ]
+}
diff --git a/plugins/scancode-categories/src/plugin_categories/plugin_categories.py b/plugins/scancode-categories/src/plugin_categories/plugin_categories.py
@@ -46,7 +46,7 @@ class CategoryRules(PostScanPlugin):
 
     options = [
         CommandLineOption(('--categories',),
-                                        help='Identify the category (e.g., Java, JavaScript, Python) for each file in the codebase being scanned.  Rules comprise a set of any() and all() functions contained as string values in a list of JSON objects.  The category and related information (including the rule applied to the file) will be added to a new "category" field in the ScanCode JSON output file.',
+                                        help='Identify the category (e.g., Java, JavaScript, Python) for each file in the codebase being scanned.  Rules comprise a set of field values( e.g., file_type and mime_type) contained as string values in a list of JSON objects.  The category and related information (including the rule applied to the file) will be added to a new "category" field in the ScanCode JSON output file.',
                                         metavar='FILE',
                                         help_group=POST_SCAN_GROUP)
     ]
@@ -74,14 +74,49 @@ def process_codebase(self, codebase, categories, **kwargs):
     def vet_resource(self, resource, categories, data, **kwargs):
         matched_rules = []
         resource.category = matched_rules
+        #
         for i in data["new_rules"]:
-            scope = locals()
-            if eval(i["test"], scope):
-                if resource.type == 'directory':
-                    resource.category = 'directory'
-                elif resource.type == 'file':
-                    matched_rules.append(OrderedDict((k, i[k]) for k in ('name', 'test', 'domain', 'status')))
-                    resource.category = matched_rules
+            if resource.type == 'directory':
+                resource.category = 'directory'
+            elif resource.type == 'file':
+                for test in i["test"]:
+                    # if the test value list is empty, ignore by defining as true
+                    if not test["extension"]:
+                        extension_test = (0 == 0)
+                    else:
+                        extension_test = any(extension == resource.extension for extension in test["extension"])
+
+                    if not test["file_type"]:
+                        file_type_test = (0 == 0)
+                    else:
+                        file_type_test = any(file_type == resource.file_type for file_type in test["file_type"])
+
+                    if not test["mime_type"]:
+                        mime_type_test = (0 == 0)
+                    else:
+                        mime_type_test = any(mime_type == resource.mime_type for mime_type in test["mime_type"])
+
+                    if not test["name"]:
+                        name_test = (0 == 0)
+                    else:
+                        name_test = any(name == resource.name for name in test["name"])
+
+                    if not test["programming_language"]:
+                        programming_language_test = (0 == 0)
+                    else:
+                        programming_language_test = any(programming_language == resource.programming_language for programming_language in test["programming_language"])
+
+                    # define the AND and OR tests
+                    and_tests = extension_test & file_type_test & mime_type_test & name_test & programming_language_test
+                    or_tests = extension_test | file_type_test | mime_type_test | name_test | programming_language_test
+
+                    # check whether operator is AND or OR
+                    if test["operator"] == "and":
+                        if and_tests:
+                            self.create_category(i, test, resource, matched_rules)
+                    elif test["operator"] == "or":
+                        if or_tests:
+                            self.create_category(i, test, resource, matched_rules)
 
         if not resource.category:
             if resource.type == 'directory':
@@ -90,3 +125,12 @@ def vet_resource(self, resource, categories, data, **kwargs):
                 resource.category = "no match"
 
         return resource
+
+    def create_category(self, i, test, resource, matched_rules):
+        i = OrderedDict((k, i[k]) for k in ('rule', 'domain', 'notes', 'status', 'test'))
+        d2 = OrderedDict((k, test[k]) for k in ('operator', 'extension', 'file_type', 'mime_type', 'name', 'programming_language'))
+        i["test"] = d2
+        matched_rules.append(i)
+        resource.category = matched_rules
+
+        return resource
diff --git a/plugins/scancode-categories/src/plugin_categories/plugin_categories_test_05.py b/plugins/scancode-categories/src/plugin_categories/plugin_categories_test_05.py
@@ -0,0 +1,92 @@
+#
+# Copyright (c) 2019 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+#  Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+#  OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+#  ScanCode should be considered or used as legal advice. Consult an Attorney
+#  for any legal advice.
+#  ScanCode is a free software code scanning tool from nexB Inc. and others.
+#  Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+
+import attr
+import json
+
+from plugincode.post_scan import PostScanPlugin
+from plugincode.post_scan import post_scan_impl
+from scancode import CommandLineOption
+from scancode import POST_SCAN_GROUP
+
+
+@post_scan_impl
+class CategoryRules(PostScanPlugin):
+    """
+    Identify the category (e.g., Java, JavaScript, Python) for each file in the codebase being scanned.
+    """
+
+    options = [
+        CommandLineOption(('--categories',),
+                                        help='Identify the category (e.g., Java, JavaScript, Python) for each file in the codebase being scanned.  Rules comprise a set of any() and all() functions contained as string values in a list of JSON objects.  The category and related information (including the rule applied to the file) will be added to a new "category" field in the ScanCode JSON output file.',
+                                        metavar='FILE',
+                                        help_group=POST_SCAN_GROUP)
+    ]
+
+    resource_attributes = dict(category=attr.ib(default=attr.Factory(dict)))
+
+    def is_enabled(self, categories, **kwargs):
+        return categories
+
+    def process_codebase(self, codebase, categories, **kwargs):
+        """
+        Populate a category mapping.
+        """
+        if not self.is_enabled(categories):
+            return
+
+        ruleset_path = categories
+        with open(ruleset_path) as json_file:
+            data = json.load(json_file)
+
+            for resource in codebase.walk(topdown=False):
+                self.vet_resource(resource, categories, data)
+                codebase.save_resource(resource)
+
+    def vet_resource(self, resource, categories, data, **kwargs):
+        matched_rules = []
+        resource.category = matched_rules
+        for i in data["new_rules"]:
+            scope = locals()
+            if eval(i["test"], scope):
+                if resource.type == 'directory':
+                    resource.category = 'directory'
+                elif resource.type == 'file':
+                    matched_rules.append(OrderedDict((k, i[k]) for k in ('name', 'test', 'domain', 'status')))
+                    resource.category = matched_rules
+
+        if not resource.category:
+            if resource.type == 'directory':
+                resource.category = 'directory'
+            else:
+                resource.category = "no match"
+
+        return resource