From b6287a3f70d98de69bdcb39cb6c1fcdbf5a8db15 Mon Sep 17 00:00:00 2001
From: "John M. Horan" <johnmhoran@gmail.com>
Date: Mon, 7 Oct 2019 12:13:24 -0700
Subject: [PATCH] Test .json ruleset #1745

* .json ruleset performs as intended.
* Cleaned up plugin_categories.py

Signed-off-by: John M. Horan <johnmhoran@gmail.com>
---
 .../src/json_rules/json_rules_01.json         |  34 +++
 .../plugin_categories/plugin_categories.py    | 137 +----------
 .../plugin_categories_test_04.py              | 219 ++++++++++++++++++
 3 files changed, 258 insertions(+), 132 deletions(-)
 create mode 100644 plugins/scancode-categories/src/json_rules/json_rules_01.json
 create mode 100644 plugins/scancode-categories/src/plugin_categories/plugin_categories_test_04.py

diff --git a/plugins/scancode-categories/src/json_rules/json_rules_01.json b/plugins/scancode-categories/src/json_rules/json_rules_01.json
new file mode 100644
index 00000000000..40d59fca12c
--- /dev/null
+++ b/plugins/scancode-categories/src/json_rules/json_rules_01.json
@@ -0,0 +1,34 @@
+{
+    "new_rules": [
+        {
+            "name": "cpp_files01_different_directory_inside_scancode_json_rules_file",
+            "test": "all(extension == resource.extension for extension in ['.cpp']) & any(file_type == resource.file_type for file_type in ['C source, ASCII text', 'C++ source, ASCII text', 'ASCII text'])",
+            "domain": "general",
+            "status": "core code"
+        },
+        {
+            "name": "cpp_files_c_extension01_different_directory_inside_scancode_json_rules_file",
+            "test": "all(extension == resource.extension for extension in ['.c']) & any(file_type == resource.file_type for file_type in ['C source, ASCII text']) & all(programming_language == resource.programming_language for programming_language in ['C++'])",
+            "domain": "general",
+            "status": "core code"
+        },
+        {
+            "name": "cpp_files_header01_different_directory_inside_scancode_json_rules_file",
+            "test": "all(extension == resource.extension for extension in ['.h']) & any(file_type == resource.file_type for file_type in ['ASCII text', 'C++ source, ASCII text', 'C source, ASCII text'])",
+            "domain": "general",
+            "status": "core code"
+        },
+        {
+            "name": "map_files01_different_directory_inside_scancode_json_rules_file",
+            "test": "all(extension == resource.extension for extension in ['.map']) & any(file_type == resource.file_type for file_type in ['ASCII text'])",
+            "domain": "general",
+            "status": "non-core JavaScript map file"
+        },
+        {
+            "name": "Blueprint_files01_different_directory_inside_scancode_json_rules_file",
+            "test": "all(extension == resource.extension for extension in ['.bp']) & any(file_type == resource.file_type for file_type in ['ASCII text'])",
+            "domain": "general",
+            "status": "non-core build configuration"
+        }
+    ]
+}
diff --git a/plugins/scancode-categories/src/plugin_categories/plugin_categories.py b/plugins/scancode-categories/src/plugin_categories/plugin_categories.py
index ac4f7edae26..2a94bbe3cbe 100644
--- a/plugins/scancode-categories/src/plugin_categories/plugin_categories.py
+++ b/plugins/scancode-categories/src/plugin_categories/plugin_categories.py
@@ -30,11 +30,7 @@
 from collections import OrderedDict
 
 import attr
-
-# from commoncode import saneyaml
-# import os.path
-# from os.path import exists
-# from os.path import isdir
+import json
 
 from plugincode.post_scan import PostScanPlugin
 from plugincode.post_scan import post_scan_impl
@@ -62,148 +58,25 @@ def is_enabled(self, categories, **kwargs):
 
     def process_codebase(self, codebase, categories, **kwargs):
         """
-        Populate a category mapping with two attributes: category_type and category_rule
-        at the File Resource level.
+        Populate a category mapping.
         """
         if not self.is_enabled(categories):
             return
 
-        # with open('/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules/python_rules_01.py', 'r') as f:
-        #     line = f.readline()
-        #     header = []
-        #     data = []
-        #     while line:
-        #         if line.startswith('#'):
-        #             header.append(line)
-        #         else:
-        #             data.append(line)
-        #         line = f.readline()
-
-        import json
-        # s = '/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules/python_rules_01.py'
-        # s = r'C:\code\nexb\dev\scancode-toolkit\plugins\scancode-categories\src\python_rules\python_rules_01.py'
-        # s = r'C:\nexb\scancode_plugin_tests\rules\python_rules_01.py'
-        # does this represent the path passed with the command?
-        s = categories
-
-        # jdata = json.loads(s)
-        # print(jdata)
-        # for d in jdata:
-        #     for key, value in d.items():
-        #         print(key, value)
-        # ============================================
-        # # this prints as expected, but test value must be in quotes
-
-        # my_path = os.path.abspath(os.path.dirname(__file__))
-        # # my_rules = os.path.join(my_path, 'test_rules_simple_01.py')
-        # # try different folder
-        # my_rules = os.path.join(my_path, '../python_rules/python_rules_01.py')
-        # with open(my_rules) as json_file:
-        #     data = json.load(json_file)
-
-        # try to import with full path from different directory
-        with open(s) as json_file:
+        ruleset_path = categories
+        with open(ruleset_path) as json_file:
             data = json.load(json_file)
 
-
-
-
-            # print('\nHELLO\n')
-            # for p in data['new_rules']:
-            #     print('Name: ' + p['name'])
-            #     print('Test: ' + p['test'])
-            #     print('')
-        # ============================================
-
-        # # apply rules to resource
-        # for resource in codebase.walk(topdown=False):
-        #     self.vet_resource(resource, categories)
-        #     codebase.save_resource(resource)
-
-            # same as above but indented one level so we can pass "data" from json.load above
             for resource in codebase.walk(topdown=False):
-                # self.vet_resource(resource, categories)
                 self.vet_resource(resource, categories, data)
                 codebase.save_resource(resource)
 
-    # def vet_resource(self, resource, categories, **kwargs):
     def vet_resource(self, resource, categories, data, **kwargs):
-        # ==================================================
-        # Next: try to read from list inside python_rules_01.py instead of the list below
-        # ==================================================
-        # new_rules = [
-        #     {
-        #         'name': 'cpp_files_01',
-        #         "test": (all(extension == resource.extension for extension in [".cpp"]) &
-        #                     any(file_type == resource.file_type for file_type in ["C source, ASCII text", "C++ source, ASCII text"])),
-        #         'domain': 'general',
-        #         'status': 'core'
-        #     },
-        #     {
-        #         'name': 'map_files_01',
-        #         "test": (any(extension == resource.extension for extension in [".map"]) &
-        #                     any(file_type == resource.file_type for file_type in ["ASCII text"])),
-        #         'domain': 'general',
-        #         'status': 'non-core'
-        #     },
-        #     {
-        #         'name': 'c_files_01',
-        #         "test": (all(resource.extension == extension for extension in [".c"]) &
-        #                     any(resource.file_type == file_type for file_type in ["C source, ASCII text"])),
-        #         'domain': 'general',
-        #         'status': 'core'
-        #     },
-        # ]
-
-        # # # with open('/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules/python_rules_01.py', 'r') as f:
-        # # with open('../python_rules/python_rules_01.py', 'r') as f:
-        # #     new_rules = [line for line in f]
-
-        # new_path = os.path.abspath(os.path.dirname(__file__))
-        # new_yamlRules = os.path.join(new_path, '../python_rules/python_rules_01.py')
-        # with open(new_yamlRules) as f:
-        #     new_rules = [line for line in f]
-        # # # #     # use safe_load instead load
-        # # # #     # rules_map = yaml.safe_load(f)
-        # # #     new_rules_map = saneyaml.load(f)
-
-        # from python_rules_01 import new_rules
-
-        # import sys
-        # import os
-        # sys.path.append(os.path.abspath("../../python_rules"))
-        # sys.path.append(os.path.abspath("/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules/python_rules_01.py"))
-        # sys.path.append(os.path.abspath("/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules"))
-
-        # all these fail: no module named x
-        # from python_rules_01 import new_rules
-        # from test_rules import new_rules
-        # import test_rules
-
-
-        # use list located in this same file
-        # now we get it fropm a .py file in the same directory
         matched_rules = []
         resource.category = matched_rules
-        # for i in new_rules:
         for i in data["new_rules"]:
-            # # if i["test"]:
-            # convert_to_code = i["test"]
-            # if convert_to_code:
-            # if exec(i["test"]):
-            # execute = exec(i["test"])
-            # execute = exec(i["test"].decode("utf-8"))
-
-            # decode_value = i["test"].decode("utf-8")
-            # decode_value = i["test"]  # no longer necessary -- see <if eval(i["test"], scope):> below
-
-            # execute = exec(decode_value)
-            # if execute:
-            # if exec(decode_value):
-            scope = locals()  # this captures all local variables, which we pass in eval() just below so the json "test" value can use "resource"!
-            # if eval(decode_value, scope):
+            scope = locals()
             if eval(i["test"], scope):
-                # with the tests cleaned up, not sure if the 1st part of this if/else is still necessary -- needs further testing
                 if resource.type == 'directory':
                     resource.category = 'directory'
                 elif resource.type == 'file':
diff --git a/plugins/scancode-categories/src/plugin_categories/plugin_categories_test_04.py b/plugins/scancode-categories/src/plugin_categories/plugin_categories_test_04.py
new file mode 100644
index 00000000000..ac4f7edae26
--- /dev/null
+++ b/plugins/scancode-categories/src/plugin_categories/plugin_categories_test_04.py
@@ -0,0 +1,219 @@
+#
+# Copyright (c) 2019 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+#  Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+#  OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+#  ScanCode should be considered or used as legal advice. Consult an Attorney
+#  for any legal advice.
+#  ScanCode is a free software code scanning tool from nexB Inc. and others.
+#  Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+
+import attr
+
+# from commoncode import saneyaml
+# import os.path
+# from os.path import exists
+# from os.path import isdir
+
+from plugincode.post_scan import PostScanPlugin
+from plugincode.post_scan import post_scan_impl
+from scancode import CommandLineOption
+from scancode import POST_SCAN_GROUP
+
+
+@post_scan_impl
+class CategoryRules(PostScanPlugin):
+    """
+    Identify the category (e.g., Java, JavaScript, Python) for each file in the codebase being scanned.
+    """
+
+    options = [
+        CommandLineOption(('--categories',),
+                                        help='Identify the category (e.g., Java, JavaScript, Python) for each file in the codebase being scanned.  Rules comprise a set of any() and all() functions contained as string values in a list of JSON objects.  The category and related information (including the rule applied to the file) will be added to a new "category" field in the ScanCode JSON output file.',
+                                        metavar='FILE',
+                                        help_group=POST_SCAN_GROUP)
+    ]
+
+    resource_attributes = dict(category=attr.ib(default=attr.Factory(dict)))
+
+    def is_enabled(self, categories, **kwargs):
+        return categories
+
+    def process_codebase(self, codebase, categories, **kwargs):
+        """
+        Populate a category mapping with two attributes: category_type and category_rule
+        at the File Resource level.
+        """
+        if not self.is_enabled(categories):
+            return
+
+        # with open('/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules/python_rules_01.py', 'r') as f:
+        #     line = f.readline()
+        #     header = []
+        #     data = []
+        #     while line:
+        #         if line.startswith('#'):
+        #             header.append(line)
+        #         else:
+        #             data.append(line)
+        #         line = f.readline()
+
+        import json
+        # s = '/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules/python_rules_01.py'
+        # s = r'C:\code\nexb\dev\scancode-toolkit\plugins\scancode-categories\src\python_rules\python_rules_01.py'
+        # s = r'C:\nexb\scancode_plugin_tests\rules\python_rules_01.py'
+        # does this represent the path passed with the command?
+        s = categories
+
+        # jdata = json.loads(s)
+        # print(jdata)
+        # for d in jdata:
+        #     for key, value in d.items():
+        #         print(key, value)
+        # ============================================
+        # # this prints as expected, but test value must be in quotes
+
+        # my_path = os.path.abspath(os.path.dirname(__file__))
+        # # my_rules = os.path.join(my_path, 'test_rules_simple_01.py')
+        # # try different folder
+        # my_rules = os.path.join(my_path, '../python_rules/python_rules_01.py')
+        # with open(my_rules) as json_file:
+        #     data = json.load(json_file)
+
+        # try to import with full path from different directory
+        with open(s) as json_file:
+            data = json.load(json_file)
+
+
+
+
+            # print('\nHELLO\n')
+            # for p in data['new_rules']:
+            #     print('Name: ' + p['name'])
+            #     print('Test: ' + p['test'])
+            #     print('')
+        # ============================================
+
+        # # apply rules to resource
+        # for resource in codebase.walk(topdown=False):
+        #     self.vet_resource(resource, categories)
+        #     codebase.save_resource(resource)
+
+            # same as above but indented one level so we can pass "data" from json.load above
+            for resource in codebase.walk(topdown=False):
+                # self.vet_resource(resource, categories)
+                self.vet_resource(resource, categories, data)
+                codebase.save_resource(resource)
+
+    # def vet_resource(self, resource, categories, **kwargs):
+    def vet_resource(self, resource, categories, data, **kwargs):
+        # ==================================================
+        # Next: try to read from list inside python_rules_01.py instead of the list below
+        # ==================================================
+        # new_rules = [
+        #     {
+        #         'name': 'cpp_files_01',
+        #         "test": (all(extension == resource.extension for extension in [".cpp"]) &
+        #                     any(file_type == resource.file_type for file_type in ["C source, ASCII text", "C++ source, ASCII text"])),
+        #         'domain': 'general',
+        #         'status': 'core'
+        #     },
+        #     {
+        #         'name': 'map_files_01',
+        #         "test": (any(extension == resource.extension for extension in [".map"]) &
+        #                     any(file_type == resource.file_type for file_type in ["ASCII text"])),
+        #         'domain': 'general',
+        #         'status': 'non-core'
+        #     },
+        #     {
+        #         'name': 'c_files_01',
+        #         "test": (all(resource.extension == extension for extension in [".c"]) &
+        #                     any(resource.file_type == file_type for file_type in ["C source, ASCII text"])),
+        #         'domain': 'general',
+        #         'status': 'core'
+        #     },
+        # ]
+
+        # # # with open('/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules/python_rules_01.py', 'r') as f:
+        # # with open('../python_rules/python_rules_01.py', 'r') as f:
+        # #     new_rules = [line for line in f]
+
+        # new_path = os.path.abspath(os.path.dirname(__file__))
+        # new_yamlRules = os.path.join(new_path, '../python_rules/python_rules_01.py')
+        # with open(new_yamlRules) as f:
+        #     new_rules = [line for line in f]
+        # # # #     # use safe_load instead load
+        # # # #     # rules_map = yaml.safe_load(f)
+        # # #     new_rules_map = saneyaml.load(f)
+
+        # from python_rules_01 import new_rules
+
+        # import sys
+        # import os
+        # sys.path.append(os.path.abspath("../../python_rules"))
+        # sys.path.append(os.path.abspath("/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules/python_rules_01.py"))
+        # sys.path.append(os.path.abspath("/c/code/nexb/dev/scancode-toolkit/plugins/scancode-categories/src/python_rules"))
+
+        # all these fail: no module named x
+        # from python_rules_01 import new_rules
+        # from test_rules import new_rules
+        # import test_rules
+
+
+        # use list located in this same file
+        # now we get it fropm a .py file in the same directory
+        matched_rules = []
+        resource.category = matched_rules
+        # for i in new_rules:
+        for i in data["new_rules"]:
+            # # if i["test"]:
+            # convert_to_code = i["test"]
+            # if convert_to_code:
+            # if exec(i["test"]):
+            # execute = exec(i["test"])
+            # execute = exec(i["test"].decode("utf-8"))
+
+            # decode_value = i["test"].decode("utf-8")
+            # decode_value = i["test"]  # no longer necessary -- see <if eval(i["test"], scope):> below
+
+            # execute = exec(decode_value)
+            # if execute:
+            # if exec(decode_value):
+            scope = locals()  # this captures all local variables, which we pass in eval() just below so the json "test" value can use "resource"!
+            # if eval(decode_value, scope):
+            if eval(i["test"], scope):
+                # with the tests cleaned up, not sure if the 1st part of this if/else is still necessary -- needs further testing
+                if resource.type == 'directory':
+                    resource.category = 'directory'
+                elif resource.type == 'file':
+                    matched_rules.append(OrderedDict((k, i[k]) for k in ('name', 'test', 'domain', 'status')))
+                    resource.category = matched_rules
+
+        if not resource.category:
+            if resource.type == 'directory':
+                resource.category = 'directory'
+            else:
+                resource.category = "no match"
+
+        return resource