Skip to content

Commit

Permalink
Analyse dumps from JoeSandbox karton. Remove unused old drakrun-prod. (
Browse files Browse the repository at this point in the history
…#9)

* Analyse dumps from JoeSandbox karton. Remove unused old drakrun-prod

* fix black format

* Remove outdated filter, change analyze_dumps function signature

* fix flake8
  • Loading branch information
Konstanty Cieśliński authored Apr 7, 2021
1 parent c0eb0bd commit ab6ad4e
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 54 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ Extracts static configuration from samples and memory dumps using the malduck en
},
{
"type": "analysis",
"kind": "drakrun-prod"
"kind": "drakrun"
},
{
"type": "analysis",
"kind": "drakrun"
"kind": "joesandbox"
}
```

Expand Down
150 changes: 99 additions & 51 deletions karton/config_extractor/config_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@
import json
import os
import re
import tempfile
import zipfile
from collections import namedtuple

from karton.core import Config, Karton, Resource, Task
from karton.core.resource import ResourceBase
from malduck.extractor import ExtractManager, ExtractorModules

from .__version__ import __version__

DumpInfo = namedtuple("DumpInfo", ("path", "base"))


class AnalysisExtractManager(ExtractManager):
"""
Expand Down Expand Up @@ -53,8 +58,8 @@ class ConfigExtractor(Karton):
"kind": "runnable",
"platform": "linux",
},
{"type": "analysis", "kind": "drakrun-prod"},
{"type": "analysis", "kind": "drakrun"},
{"type": "analysis", "kind": "joesandbox"},
]

@classmethod
Expand Down Expand Up @@ -133,52 +138,52 @@ def analyze_sample(self, sample: ResourceBase) -> None:
else:
self.log.info("Failed to get config")

# analyze a drakrun analysis
def analyze_drakrun(self, sample, path):
def analyze_dumps(self, sample, dump_infos):
"""
Analyse multiple dumps from given sample. There can be more than one
dump from which we managed to extract config from – try to find the best
candidate for each family.
"""
extractor = create_extractor(self)
dumps_path = os.path.join(path, "dumps")
dump_candidates = {}

results = {
"analysed": 0,
"crashed": 0,
}

analysis_dumps = sorted(os.listdir(dumps_path))
for i, dump in enumerate(analysis_dumps):
# catch only dumps
if re.match(r"^[a-f0-9]{4,16}_[a-f0-9]{16}$", dump):
results["analysed"] += 1
self.log.debug(
"Analyzing dump %d/%d %s", i, len(analysis_dumps), str(dump)
)
dump_path = os.path.join(dumps_path, dump)

with open(dump_path, "rb") as f:
dump_data = f.read()

if not dump_data:
self.log.warning("Dump {} is empty".format(dump))
continue

base = int(dump.split("_")[0], 16)
for i, dump_info in enumerate(dump_infos):
dump_basename = os.path.basename(dump_info.path)
results["analysed"] += 1
self.log.debug(
"Analyzing dump %d/%d %s", i, len(dump_infos), str(dump_basename)
)

try:
family = extractor.push_file(dump_path, base=base)
if family:
self.log.info("Found better %s config in %s", family, dump)
dump_candidates[family] = (dump, dump_data)
except Exception:
self.log.exception("Error while extracting from {}".format(dump))
results["crashed"] += 1
with open(dump_info.path, "rb") as f:
dump_data = f.read()

if not dump_data:
self.log.warning("Dump {} is empty".format(dump_basename))
continue

try:
family = extractor.push_file(dump_info.path, base=dump_info.base)
if family:
self.log.info("Found better %s config in %s", family, dump_basename)
dump_candidates[family] = (dump_basename, dump_data)
except Exception:
self.log.exception(
"Error while extracting from {}".format(dump_basename)
)
results["crashed"] += 1

self.log.debug("Finished analysing dump no. %d", i)
self.log.debug("Finished analysing dump no. %d", i)

self.log.info("Merging and reporting extracted configs")
for family, config in extractor.configs.items():
dump, dump_data = dump_candidates[family]
self.log.info("* (%s) %s => %s", family, dump, json.dumps(config))
parent = Resource(name=dump, content=dump_data)
dump_basename, dump_data = dump_candidates[family]
self.log.info("* (%s) %s => %s", family, dump_basename, json.dumps(config))
parent = Resource(name=dump_basename, content=dump_data)
task = Task(
{
"type": "sample",
Expand All @@ -198,37 +203,80 @@ def analyze_drakrun(self, sample, path):

self.log.info("done analysing, results: {}".format(json.dumps(results)))

def get_base_from_drakrun_dump(self, dump_name):
    """
    Extract the load address from a drakrun dump filename.

    Drakrun names its dumps <base>_<hash>, e.g. 405000_688f58c58d798ecb:
    a dump taken from address 0x405000 whose content hash is
    688f58c58d798ecb. The base is the hex field before the underscore.
    """
    base_hex, _, _ = dump_name.partition("_")
    return int(base_hex, 16)

def analyze_drakrun(self, sample, dumps):
    """
    Unpack a drakrun dumps resource and run config extraction on every
    memory dump found inside.

    Drakrun stores meta information in a separate file next to each dump;
    only names matching the <base>_<hash> pattern are actual dumps, so
    everything else is filtered out before analysis.
    """
    dump_name_re = re.compile(r"^[a-f0-9]{4,16}_[a-f0-9]{16}$")
    with dumps.extract_temporary() as tmpdir:  # type: ignore
        dumps_path = os.path.join(tmpdir, "dumps")
        dump_infos = [
            DumpInfo(
                path=os.path.join(dumps_path, fname),
                base=self.get_base_from_drakrun_dump(fname),
            )
            for fname in os.listdir(dumps_path)
            if dump_name_re.match(fname)
        ]
        self.analyze_dumps(sample, dump_infos)

def get_base_from_joesandbox_dump(self, dump_name):
    """
    Extract the load address encoded in a JoeSandbox dump filename.

    JoeSandbox dumps come in three formats:
    1) raw dumps with .sdmp extension, e.g.
       00000002.00000003.385533966.003C0000.00000004.00000001.sdmp
       (base is the 4th dot-separated field)
    2) dumps that start with 0x4d5a bytes
       2.1) unmodified with .raw.unpack extension, e.g.
            0.0.tmpi0shwswy.exe.1290000.0.raw.unpack
       2.2) modified by joesandbox engine with .unpack extension, e.g.
            0.0.tmpi0shwswy.exe.1290000.0.unpack
       (in both cases the base is the 5th dot-separated field)

    Returns the base address as an int, or None when the filename does
    not match any known format.
    """
    # Match on the actual extension instead of substring membership so a
    # middle field that happens to contain "sdmp"/"unpack" cannot select
    # the wrong branch.
    if dump_name.endswith(".sdmp"):
        return int(dump_name.split(".")[3], 16)
    # ".raw.unpack" also ends with ".unpack" and both formats keep the
    # base in the same (5th) field, so one branch covers 2.1 and 2.2.
    if dump_name.endswith(".unpack"):
        return int(dump_name.split(".")[4], 16)
    # Unknown naming scheme: return None explicitly (the original fell
    # through implicitly with the same result).
    return None

def analyze_joesandbox(self, sample, dumps):
    """
    Download a JoeSandbox "dumps.zip" resource, extract it and run
    config extraction on every dump inside.

    :param sample: original sample resource, forwarded to analyze_dumps
    :param dumps: resource holding the password-protected zip of dumps
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        dumps_zip = os.path.join(tmpdir, "dumps.zip")
        dumps.download_to_file(dumps_zip)
        dumps_path = os.path.join(tmpdir, "dumps")
        # Use a context manager so the archive handle is closed
        # deterministically (the original leaked the ZipFile until GC).
        with zipfile.ZipFile(dumps_zip) as zipf:
            # JoeSandbox archives are protected with the conventional
            # "infected" password.
            zipf.extractall(dumps_path, pwd=b"infected")
        dump_infos = []
        for fname in os.listdir(dumps_path):
            dump_path = os.path.join(dumps_path, fname)
            # base may be None for filenames in an unknown format —
            # get_base_from_joesandbox_dump documents that behavior.
            dump_base = self.get_base_from_joesandbox_dump(fname)
            dump_infos.append(DumpInfo(path=dump_path, base=dump_base))
        self.analyze_dumps(sample, dump_infos)

def process(self, task: Task) -> None:  # type: ignore
    """
    Karton entry point: dispatch an incoming task to the right analyser
    based on its headers.

    - type=sample: run extractors directly on the binary
    - type=analysis, kind=drakrun: DRAKVUF Sandbox (codename: drakmon OSS)
      analysis — extract configs from its memory dumps
    - type=analysis, kind=joesandbox: JoeSandbox analysis — extract
      configs from its dumps.zip

    NOTE(review): this span mixed leftover pre-change lines (the removed
    "drakrun-prod" branch calling the old analyze_drakrun(sample, fpath)
    signature) with the current code; the dead branch is dropped here to
    match the signatures actually defined above.
    """
    sample = task.get_resource("sample")
    headers = task.headers

    if headers["type"] == "sample":
        self.log.info("Analyzing original binary")
        self.analyze_sample(sample)
    elif headers["type"] == "analysis" and headers["kind"] == "drakrun":
        # DRAKVUF Sandbox (codename: drakmon OSS)
        sample_hash = hashlib.sha256(sample.content or b"").hexdigest()
        self.log.info(
            "Processing drakmon OSS analysis, sample: {}".format(sample_hash)
        )
        dumps = task.get_resource("dumps.zip")
        self.analyze_drakrun(sample, dumps)
    elif headers["type"] == "analysis" and headers["kind"] == "joesandbox":
        sample_hash = hashlib.sha256(sample.content or b"").hexdigest()
        self.log.info(f"Processing joesandbox analysis, sample: {sample_hash}")
        dumps = task.get_resource("dumps.zip")
        self.analyze_joesandbox(sample, dumps)

    self.log.debug("Printing gc stats")
    self.log.debug(gc.get_stats())
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
karton.core==4.0.5
karton-core==4.2.0
malduck==4.1.0

0 comments on commit ab6ad4e

Please # to comment.