Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add oleobj arg to just find external relations #772

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions oletools/common/io_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
or unusual language settings.

In such settings, output to console falls back to ASCII-only. Also open()
suddenly fails to interprete non-ASCII characters.
suddenly fails to interpret non-ASCII characters.

Therefore, at start of scripts can run :py:meth:`ensure_stdout_handles_unicode`
and when opening text files use :py:meth:`uopen` to replace :py:meth:`open`.
Expand All @@ -18,7 +18,7 @@

# === LICENSE =================================================================

# msodde is copyright (c) 2017-2018 Philippe Lagadec (http://www.decalage.info)
# io_encoding is copyright (c) 2017-2018 Philippe Lagadec (http://www.decalage.info)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
Expand Down
1 change: 0 additions & 1 deletion oletools/msodde.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@
from oletools import rtfobj
from oletools.ppt_record_parser import is_ppt
from oletools import crypto
from oletools.common.io_encoding import ensure_stdout_handles_unicode
from oletools.common.log_helper import log_helper

# -----------------------------------------------------------------------------
Expand Down
250 changes: 123 additions & 127 deletions oletools/oleobj.py

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion oletools/olevba.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,6 @@
from oletools import oleform
from oletools import rtfobj
from oletools import crypto
from oletools.common.io_encoding import ensure_stdout_handles_unicode
from oletools.common import codepages
from oletools import ftguess
from oletools.common.log_helper import log_helper
Expand Down
44 changes: 44 additions & 0 deletions tests/common/test_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Test that all --json output is always valid json.

Since this test takes rather long, it is not included in regular unittest runs.
To enable it, set environment variable OLETOOLS_TEST_JSON to value "1"
"""

import os
from os.path import relpath
import json
import unittest

from tests.test_utils import DATA_BASE_DIR, call_and_capture
from tests.test_utils.testdata_reader import loop_and_extract


@unittest.skipIf(os.environ.get('OLETOOLS_TEST_JSON') != '1',
                 'Test takes pretty long, do not include in regular test runs')
class TestJson(unittest.TestCase):
    """Test that all --json output is always valid json."""

    # each entry: (tool name, base arguments before the sample path)
    TOOL_ARGS = (
        ('oleobj', ['--json', '--nodump']),
        ('olevba', ['--json']),
        ('msodde', ['--json']),
    )

    def test_all(self):
        """Check that olevba, msodde and oleobj produce valid json for ALL samples."""
        for sample_path in loop_and_extract():
            # show path relative to the data dir when possible, full path otherwise
            if sample_path.startswith(DATA_BASE_DIR):
                shown = relpath(sample_path, DATA_BASE_DIR)
            else:
                shown = sample_path
            print(f'TestJson: checking sample {shown}')

            for tool, base_args in self.TOOL_ARGS:
                output, _ = call_and_capture(tool, base_args + [sample_path],
                                             accept_nonzero_exit=True)
                # raises ValueError / JSONDecodeError if output is not valid json
                json.loads(output)


# just in case somebody calls this file as a script
if __name__ == '__main__':
unittest.main()
20 changes: 16 additions & 4 deletions tests/oleobj/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import unittest
from tempfile import mkdtemp
from shutil import rmtree
from os import listdir
from os.path import join, isfile
from hashlib import md5
from glob import glob
Expand Down Expand Up @@ -68,16 +69,16 @@ def preread_file(args):
raise ValueError('ignore_arg not as expected!')
with open(filename, 'rb') as file_handle:
data = file_handle.read()
err_stream, err_dumping, did_dump = \
err_stream, err_dumping, did_dump, found_external = \
oleobj.process_file(filename, data, output_dir=output_dir)
if did_dump and not err_stream and not err_dumping:
if did_dump and not err_stream and not err_dumping and not found_external:
return oleobj.RETURN_DID_DUMP
else:
return oleobj.RETURN_NO_DUMP # just anything else
return oleobj.RETURN_NO_DUMP # just anything else, will cause error


class TestOleObj(unittest.TestCase):
""" Tests oleobj basic feature """
"""Tests oleobj basic feature: dump embedded content."""

def setUp(self):
""" fixture start: create temp dir """
Expand Down Expand Up @@ -158,6 +159,17 @@ def test_non_streamed(self):
return self.do_test_md5(['-d', self.temp_dir], test_fun=preread_file,
only_run_every=4)

def test_nodump(self):
    """Ensure that with --nodump nothing is ever written to disc."""
    data_dir = join(DATA_BASE_DIR, 'oleobj')
    for sample_name, _, _ in SAMPLES:
        sample_path = join(data_dir, sample_name)
        # run oleobj with --nodump; return code does not matter here
        call_and_capture('oleobj',
                         ['-d', self.temp_dir, '--nodump', sample_path],
                         accept_nonzero_exit=True)
        # the output dir must stay empty for every sample
        leftovers = listdir(self.temp_dir)
        if leftovers:
            self.fail('Found file in temp dir despite "--nodump": {}'.format(leftovers))


class TestSaneFilenameCreation(unittest.TestCase):
""" Test sanitization / creation of sane filenames """
Expand Down
12 changes: 10 additions & 2 deletions tests/oleobj/test_external_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,20 @@ def test_external_links(self):
for dirpath, _, filenames in os.walk(BASE_DIR):
for filename in filenames:
file_path = path.join(dirpath, filename)
if not path.isfile(file_path):
continue

output, ret_val = call_and_capture('oleobj', [file_path, ],
output, ret_val = call_and_capture('oleobj', ['--nodump', file_path, ],
accept_nonzero_exit=True)
self.assertEqual(ret_val, oleobj.RETURN_DID_DUMP,
self.assertEqual(ret_val, oleobj.RETURN_FOUND_EXTERNAL,
msg='Wrong return value {} for {}. Output:\n{}'
.format(ret_val, filename, output))
found_relationship = False
for line in output.splitlines():
if line.startswith('Found relationship'):
found_relationship = True
break
self.assertTrue(found_relationship)


# just in case somebody calls this file as a script
Expand Down
46 changes: 40 additions & 6 deletions tests/test_utils/testdata_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
"""

import os, sys, zipfile
from os.path import relpath, join, isfile
from os.path import relpath, join, isfile, splitext
from contextlib import contextmanager
from tempfile import mkstemp
from tempfile import mkstemp, TemporaryDirectory, NamedTemporaryFile

from . import DATA_BASE_DIR

Expand Down Expand Up @@ -73,11 +73,10 @@ def loop_over_files(subdir=''):
and the contents of the file, with the file being unzipped first if it ends
with .zip.

:arg str subdir: Optional subdir of test data dir that caller is interested
in
"""
# create temp dir to extract files into
See also: :py:meth:`loop_and_extract`

:param str subdir: Optional subdir of test data dir that caller is interested in
"""
for base_dir, _, files in os.walk(join(DATA_BASE_DIR, subdir)):
for filename in files:
relative_path = relpath(join(base_dir, filename), DATA_BASE_DIR)
Expand All @@ -87,6 +86,41 @@ def loop_over_files(subdir=''):
yield relative_path, read(relative_path)


def loop_and_extract(subdir=''):
    """
    Yield an absolute path for every test sample, unzipping encrypted ones.

    Walks all test data (or only the given subdir) with `os.walk`. Plain
    samples are yielded at their original location inside `DATA_BASE_DIR`;
    samples stored as password-protected ``.zip`` files are extracted into a
    temporary directory first and the path of the extracted copy is yielded.

    The temp dir and files inside it are always deleted right after usage.

    See also: :py:meth:`loop_over_files`

    :param str subdir: Optional subdir of test data dir that caller is interested in
    """
    with TemporaryDirectory(prefix='oletools-test-') as scratch_dir:
        for dir_path, _, file_names in os.walk(join(DATA_BASE_DIR, subdir)):
            for file_name in file_names:
                sample_path = join(dir_path, file_name)
                if not file_name.endswith('.zip'):
                    # nothing to extract, sample can be used as-is
                    yield sample_path
                    continue

                # strip the ".zip", then split the rest into name and extension
                base_name, extension = splitext(splitext(file_name)[0])

                # temp file gets a sensible name and is removed when closed
                with zipfile.ZipFile(sample_path, 'r') as zipped, \
                        NamedTemporaryFile(dir=scratch_dir, prefix=base_name,
                                           suffix=extension) as plain_file:
                    # test samples are small, reading all at once is fine
                    plain_file.write(zipped.read(zipped.namelist()[0],
                                                 pwd=ENCRYPTED_FILES_PASSWORD))
                    plain_file.flush()
                    yield plain_file.name


@contextmanager
def decrypt_sample(relpath):
"""
Expand Down
15 changes: 4 additions & 11 deletions tests/test_utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@ def call_and_capture(module, args=None, accept_nonzero_exit=False,
Only drawback sofar: stdout and stderr are merged into one (which is
what users see on their shell as well). When testing for json-compatible
output you should `exclude_stderr` to `False` since logging ignores stderr,
so unforseen warnings (e.g. issued by pypy) would mess up your json.
so unforeseen warnings (e.g. issued by pypy) would mess up your json.

:param str module: name of module to test, e.g. `olevba`
:param args: arguments for module's main function
:param bool fail_nonzero: Raise error if command returns non-0 return code
:param bool accept_nonzero_exit: Do not raise error if command returns non-0 return code
:param bool exclude_stderr: Exclude output to `sys.stderr` from output
(e.g. if parsing output through json)
:returns: ret_code, output
:rtype: int, str
:returns: output, ret_code
:rtype: str, int
"""
# create a PYTHONPATH environment var to prefer our current code
env = os.environ.copy()
Expand All @@ -47,13 +47,6 @@ def call_and_capture(module, args=None, accept_nonzero_exit=False,
except KeyError:
env['PYTHONPATH'] = SOURCE_BASE_DIR

# hack: in python2 output encoding (sys.stdout.encoding) was None
# although sys.getdefaultencoding() and sys.getfilesystemencoding were ok
# TODO: maybe can remove this once branch
# "encoding-for-non-unicode-environments" is merged
if 'PYTHONIOENCODING' not in env:
env['PYTHONIOENCODING'] = 'utf8'

# ensure args is a tuple
my_args = tuple(args) if args else ()

Expand Down