Merge pull request #3 from wrobstory/PR_daskify
PR Daskify
wrobstory committed Aug 14, 2015
2 parents d03519b + 363df72 commit fc4a114
Showing 19 changed files with 319 additions and 80 deletions.
12 changes: 5 additions & 7 deletions README.md
@@ -23,7 +23,7 @@ Because for (mostly) structured documents where we're expecting the schema to ra

How
------
Malort will read through a directory of .json or flat text files with delimited JSON blobs and generate relevant statistics on each key.
Malort will read through a directory of .json or flat text files (optionally gzipped) with newline-delimited JSON blobs and generate relevant statistics on each key. It uses the Dask library to parallelize these computations.

For example, let's look at a directory with two JSON files, and one text file with newline-delimited JSON:
```json
@@ -109,18 +109,16 @@ Install

API
---
* `result = malort.analyze(path, delimiter='\n', parse_timestamps=True)`
* `result = malort.analyze(path, parse_timestamps=True)`

```python
Analyze a given directory of either .json or flat text files
with delimited JSON to get relevant key statistics.
Analyze a given directory of either .json, flat text files
with newline-delimited JSON, or gzipped files with newline-delimited JSON to get relevant key statistics.

Parameters
----------
path: string
Path to directory
delimiter: string, default newline
For flat text files, the JSON blob delimiter
parse_timestamps: boolean, default True
    If True, will attempt to regex match ISO8601 formatted timestamps
```
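
For example, a minimal run might look like the following sketch (the directory name is a placeholder, and the comment about the return value defers to the rest of this README for the exact accessors):

```python
import malort

# "sample_logs" is a placeholder directory containing .json files,
# newline-delimited JSON text files, or gzipped newline-delimited JSON.
result = malort.analyze("sample_logs", parse_timestamps=True)

# The returned object wraps the per-key statistics and type suggestions;
# see the usage examples elsewhere in this README for the exact accessors.
```
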
@@ -158,7 +156,7 @@ Yes.

How fast is it?
---------------
With timestamp parsing turned on, I used Malort to process 2.1 GB of files (1,326,794 nested JSON blobs) in 8 minutes. There are undoubtedly ways to do it faster.
With timestamp parsing turned on, I used Malort to process 2.1 GB of files (1,326,794 nested JSON blobs) in 8 minutes. There are undoubtedly ways to do it faster. Speed will depend on a number of factors, including nesting depth.

Should I use the column type results verbatim?
----------------------------------------------
25 changes: 14 additions & 11 deletions malort/core.py
@@ -6,23 +6,24 @@
JSON -> Postgres Column types
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import, print_function, division

from collections import defaultdict
from functools import partial
import json
import os
from os.path import isfile, join, splitext
import random
import re
import time

from malort.stats import recur_dict, dict_generator
import dask.bag as db

from malort.stats import recur_dict, combine_stats, dict_generator
from malort.type_mappers import TypeMappers


def analyze(path, delimiter='\n', parse_timestamps=True, **kwargs):
def analyze(path, parse_timestamps=True, **kwargs):
"""
Analyze a given directory of either .json or flat text files
with delimited JSON to get relevant key statistics.
@@ -31,19 +31,21 @@ def analyze(path, delimiter='\n', parse_timestamps=True, **kwargs):
----------
path: string
Path to directory
delimiter: string, default newline
For flat text files, the JSON blob delimiter
parse_timestamps: boolean, default True
If True, will attempt to regex match ISO8601 formatted timestamps
kwargs:
passed into json.loads. Here you can specify encoding, etc.
kwargs:
passed into json.loads. Here you can specify encoding, etc.
"""

stats = {}

start_time = time.time()
for count, blob in enumerate(dict_generator(path, delimiter, **kwargs), start=1):
recur_dict(blob, stats, parse_timestamps=parse_timestamps)
file_list = [os.path.join(path, f) for f in os.listdir(path)]
bag = db.from_filenames(file_list).map(json.loads)
recur_partial = partial(recur_dict, parse_timestamps=parse_timestamps)
stats = bag.fold(recur_partial, combine_stats, initial={}).compute()
count = stats["total_records"]
del stats["total_records"]

elapsed = time.time() - start_time
print('Malort run finished: {} JSON blobs analyzed in {} seconds.'
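
The rewritten `analyze` above follows a map/fold pattern: `dask.bag` turns the directory listing into a bag of JSON lines (the diff uses `db.from_filenames`), `recur_dict` folds each record into a per-partition stats dict, and `combine_stats` merges the partition results. Below is a minimal, self-contained sketch of that same pattern with toy stand-ins for Malort's stat functions; it uses `db.from_sequence` instead of files so it can run as-is, and the stat keys are illustrative.

```python
import json

import dask.bag as db


def update_stats(stats, record):
    """Fold step (binop): add one parsed record to a stats dict."""
    new = {k: dict(v) for k, v in stats.items()}  # avoid mutating the accumulator
    for key, value in record.items():
        entry = new.setdefault(key, {"count": 0, "max": None})
        entry["count"] += 1
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            entry["max"] = value if entry["max"] is None else max(entry["max"], value)
    return new


def merge_stats(left, right):
    """Combine step: merge the stats dicts produced by two partitions."""
    merged = {k: dict(v) for k, v in left.items()}
    for key, entry in right.items():
        if key not in merged:
            merged[key] = dict(entry)
            continue
        merged[key]["count"] += entry["count"]
        maxes = [m for m in (merged[key]["max"], entry["max"]) if m is not None]
        merged[key]["max"] = max(maxes) if maxes else None
    return merged


lines = ['{"a": 1, "b": "x"}', '{"a": 5}', '{"b": "y", "c": 2.5}']
bag = db.from_sequence(lines, npartitions=2).map(json.loads)
stats = bag.fold(update_stats, merge_stats, initial={}).compute()
# stats -> per-key counts and maxima, e.g. stats["a"]["count"] == 2
```
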
116 changes: 109 additions & 7 deletions malort/stats.py
@@ -6,9 +6,7 @@
Functions to generate Malort stats
"""
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import, print_function, division

import decimal
import json
@@ -25,6 +23,7 @@
\d([\.,]\d+)?)?([zZ]|([\+-])([01]\d|2[0-3]):?([0-5]\d)?)
?)?)?$""", re.VERBOSE)


def delimited(file, delimiter='\n', bufsize=4096):
buf = ''
while True:
@@ -38,6 +37,7 @@ def catch_json_error(blob, filepath, **kwargs):
yield line
buf = lines[-1]


def catch_json_error(blob, filepath, **kwargs):
"""Wrapper to provide better error message for JSON reads"""
try:
@@ -48,6 +48,7 @@ def catch_json_error(blob, filepath, **kwargs):

return parsed


def dict_generator(path, delimiter='\n', **kwargs):
"""
Given a directory path, return a generator that will return a dict for each
@@ -80,11 +81,109 @@ def dict_generator(path, delimiter='\n', **kwargs):
else:
yield catch_json_error(fread.read(), filepath, **kwargs)


def get_new_mean(value, current_mean, count):
"""Given a value, current mean, and count, return new mean"""
summed = current_mean * count
return (summed + value)/(count + 1)
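
The running mean is re-derived from the implied sum; for example, folding a value of 10 into a mean of 2.0 over 4 prior values gives (2.0 * 4 + 10) / 5 = 3.6:

```python
from malort.stats import get_new_mean

assert get_new_mean(10, 2.0, 4) == 3.6
```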


def combine_means(means, counts):
"""Combine ordered iter of means and counts"""
numer = sum([mean * count for mean, count in zip(means, counts)
if mean is not None and count is not None])
denom = sum([c for c in counts if c is not None])
return round(numer / denom, 3)
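
`combine_means` is the merge-side counterpart of `get_new_mean`: each branch's sum is recovered as mean * count, and the summed totals are divided by the combined count. For example, means of 2.0 (over 3 values) and 4.0 (over 1 value) combine to (2.0 * 3 + 4.0 * 1) / 4 = 2.5:

```python
from malort.stats import combine_means

assert combine_means([2.0, 4.0], [3, 1]) == 2.5
```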


def combine_stats(accum, value):
"""
Combine two sets of stats into one. Used for final dask rollup of
multiple partitions of stats dicts into one unified stats dict. Best
thought of as a reduction over multiple stats objects.

Parameters
----------
accum: dict
Accumulating stats dict
value: dict
New stats dict to merge with accumulator

Returns
-------
dict
"""

for field_name, type_stats in value.items():

# Update total count
if field_name == "total_records":
accum["total_records"] = accum["total_records"] + type_stats
continue

# Combine accum stats from different branches
if accum.get(field_name):
for value_type, val_stats in type_stats.items():
accum_entry = accum[field_name].get(value_type)
if not accum_entry:
# If accum doesn't have an entry yet but the value
# does, copy it over and continue
accum[field_name][value_type] = val_stats
continue

# base_key is not a statistic to be updated
if value_type == "base_key":
continue

if accum_entry:
max_ = (accum_entry.get("max"), val_stats.get("max"))
min_ = (accum_entry.get("min"), val_stats.get("min"))
count = (accum_entry.get("count"), val_stats.get("count"))
mean = (accum_entry.get("mean"), val_stats.get("mean"))

if any(max_):
accum_entry["max"] = max(max_)
if any(min_):
accum_entry["min"] = min(min_)
if any(count):
accum_entry["count"] = sum([c for c in count
if c is not None])
if any(mean):
accum_entry["mean"] = combine_means(mean, count)

# Type specific entries
if value_type == "float":

fixed_length = (accum_entry.get("fixed_length"),
val_stats.get("fixed_length"))

# Decimals not fixed length if prec/scale do not match
a_prec, a_scale = (accum_entry.get("max_precision"),
accum_entry.get("max_scale"))
v_prec, v_scale = (val_stats.get("max_precision"),
val_stats.get("max_scale"))
prec_scale_eq = (a_prec == v_prec, a_scale == v_scale)

if not all(fixed_length) or not all(prec_scale_eq):
accum_entry["fixed_length"] = False

max_prec = (accum_entry.get("max_precision"),
val_stats.get("max_precision"))
accum_entry["max_precision"] = max(max_prec)

max_scale = (accum_entry.get("max_scale"),
val_stats.get("max_scale"))
accum_entry["max_scale"] = max(max_scale)

elif value_type == "str":
samples = accum_entry.get("sample", []) +\
val_stats.get("sample", [])
accum_entry["sample"] = random.sample(
samples, min(len(samples), 3))

else:
accum[field_name] = type_stats

return accum
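
A hedged illustration of the merge, with the stats dicts trimmed to the count/mean/min/max keys handled above (real Malort entries carry additional type-specific fields):

```python
from malort.stats import combine_stats

# Two hypothetical per-partition results for a single integer field.
left = {
    "total_records": 2,
    "intfield": {"base_key": "intfield",
                 "int": {"count": 2, "mean": 7.5, "min": 5, "max": 10}},
}
right = {
    "total_records": 1,
    "intfield": {"base_key": "intfield",
                 "int": {"count": 1, "mean": 15.0, "min": 15, "max": 15}},
}

merged = combine_stats(left, right)
# merged["total_records"] == 3, and the "int" entry becomes
# {"count": 3, "mean": 10.0, "min": 5, "max": 15}.
```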


def updated_entry_stats(value, current_stats, parse_timestamps=True):
"""
Given a value and a dict of current statistics, return a dict of new
@@ -156,7 +255,7 @@ def updated_entry_stats(value, current_stats, parse_timestamps=True):
return value_type, new_stats


def recur_dict(value, stats, parent=None, **kwargs):
def recur_dict(stats, value, parent=None, **kwargs):
"""
Recurse through a dict `value` and update `stats` for each field.
Can handle nested dicts, lists of dicts, and lists of values (must be
@@ -170,9 +269,12 @@ def recur_dict(value, stats, parent=None, **kwargs):
Parent key to get key nesting depth.
kwargs: Options for update_entry_stats
"""

parent = parent or ''

if parent == '':
total_records = stats.get("total_records")
stats["total_records"] = (total_records + 1) if total_records else 1

def update_stats(current_val, nested_path, base_key):
"Updater function"
if nested_path not in stats:
@@ -187,14 +289,14 @@ def update_stats(current_val, nested_path, base_key):
for k, v in value.items():
parent_path = '.'.join([parent, k]) if parent != '' else k
if isinstance(v, (list, dict)):
recur_dict(v, stats, parent_path)
recur_dict(stats, v, parent_path)
else:
update_stats(v, parent_path, k)

elif isinstance(value, list):
for v in value:
if isinstance(v, (list, dict)):
recur_dict(v, stats, parent)
recur_dict(stats, v, parent)
else:
base_key = parent.split(".")[-1]
update_stats(json.dumps(value), parent, base_key)
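
With the argument order flipped to `(stats, value)` above, `recur_dict` plugs directly into the dask fold as the per-record step. A hedged single-blob example (key names are illustrative):

```python
from malort.stats import recur_dict

stats = {}
recur_dict(stats, {"user": {"id": 7, "name": "ada"}, "active": True})

# stats now holds one entry per dot-delimited key path ("user.id",
# "user.name", "active"), plus a "total_records" count of 1.
```
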
8 changes: 6 additions & 2 deletions malort/test_helpers.py
@@ -12,16 +12,20 @@
TEST_FILES_1 = os.path.normpath(os.path.join(os.path.abspath(__file__),
'..', 'tests', 'test_files'))
TEST_FILES_2 = os.path.normpath(os.path.join(os.path.abspath(__file__),
'..', 'tests', 'test_files_delimited'))
'..', 'tests', 'test_files_newline_delimited'))
TEST_FILES_3 = os.path.normpath(os.path.join(os.path.abspath(__file__),
'..', 'tests', 'test_files_nested'))

TEST_FILES_4 = os.path.normpath(os.path.join(os.path.abspath(__file__),
'..', 'tests', 'test_files_mult_type'))

class TestHelpers(unittest.TestCase):

def assert_stats(self, result, expected):
"""Test helper for testing stats results"""
for key, value in result.items():
if key == 'total_records':
self.assertEqual(expected['total_records'], value)
continue
for typek, typev in value.items():
if typek == 'str':
for k, v in typev.items():
6 changes: 1 addition & 5 deletions malort/tests/test_files/test1.json
@@ -1,5 +1 @@
{"intfield": 5,
"floatfield": 2.345,
"datefield": "2014-09-26 17:00:00",
"charfield": "fixedlength",
"varcharfield": "var"}
{"intfield": 5,"floatfield": 2.345,"datefield": "2014-09-26 17:00:00","charfield": "fixedlength","varcharfield": "var"}
6 changes: 1 addition & 5 deletions malort/tests/test_files/test2.json
@@ -1,5 +1 @@
{"intfield": 10,
"floatfield": 4.7891,
"datefield": "2014-09-26 17:00:00",
"charfield": "fixedlength",
"varcharfield": "varyin"}
{"intfield": 10,"floatfield": 4.7891,"datefield": "2014-09-26 17:00:00","charfield": "fixedlength","varcharfield": "varyin"}
1 change: 1 addition & 0 deletions malort/tests/test_files/test3.json
@@ -0,0 +1 @@
{"intfield": 15,"floatfield": 3.0012,"charfield": "fixedlength","varcharfield": "varyingle","datefield": "2014-09-26 17:00:00"}
1 change: 1 addition & 0 deletions malort/tests/test_files/test4.json
@@ -0,0 +1 @@
{"intfield": 20,"floatfield": 10.8392,"charfield": "fixedlength","varcharfield": "varyinglengt","datefield": "2014-09-26 17:00:00"}
1 change: 0 additions & 1 deletion malort/tests/test_files_delimited/test_pipe_delimited_1

This file was deleted.

1 change: 0 additions & 1 deletion malort/tests/test_files_delimited/test_pipe_delimited_2

This file was deleted.

2 changes: 2 additions & 0 deletions malort/tests/test_files_mult_type/test_mult_delimited_1
@@ -0,0 +1,2 @@
{"foo": 10,"bar": 2.0,"baz": "fixed","qux": 10}
{"foo": "foo","bar": true,"baz": 2,"qux": "varyin"}
2 changes: 2 additions & 0 deletions malort/tests/test_files_mult_type/test_mult_delimited_2
@@ -0,0 +1,2 @@
{"foo": 1000,"bar": "bar","baz": "fixed","qux": "var"}
{"foo": "foo","bar": 4.0,"baz": 1,"qux": "varyingle"}
File renamed without changes.
2 changes: 2 additions & 0 deletions malort/tests/test_files_newline_delimited/test_nl_delimited_2
@@ -0,0 +1,2 @@
{"intfield": 5,"floatfield": 2.345,"datefield": "2014-09-26 17:00:00","charfield": "fixedlength","varcharfield": "var"}
{"intfield": 10,"floatfield": 4.7891,"datefield": "2014-09-26 17:00:00","charfield": "fixedlength","varcharfield": "varyin"}