These functions are similar to the JSON functionality provided by other databases.
DuckDB uses [yyjson](https://github.com/ibireme/yyjson), a high-performance JSON library written in ANSI C, to parse JSON internally. Many thanks to the yyjson authors and contributors!

Besides these functions, DuckDB is now able to read JSON directly!
This is done by automatically detecting the types and column names, then converting the values within the JSON to DuckDB's vectors.
The automated schema detection dramatically simplifies working with JSON data, and subsequent queries on DuckDB's vectors are significantly faster!

## Reading JSON Automatically with DuckDB
With automatic schema detection, reading a JSON file requires nothing more than:

```sql
SELECT * FROM 'todos.json';
```
Now, finding out which user completed the most TODO items is as simple as:

```sql
SELECT userId, sum(completed::INTEGER) AS total_completed
FROM 'todos.json'
GROUP BY userId
ORDER BY total_completed DESC;
```

## Newline-Delimited JSON

Besides JSON arrays, DuckDB also supports newline-delimited JSON.
This is specified with `nd` in the function name or the `format` parameter:

```sql
SELECT * FROM read_ndjson_auto('todos2.json');
SELECT * FROM read_json_auto('todos2.json', format = 'newline_delimited');
```

You can also set `format = 'auto'` to auto-detect whether the JSON file is newline-delimited.
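
For example (a minimal sketch reusing the file from above):

```sql
SELECT * FROM read_json_auto('todos2.json', format = 'auto');
```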
The first file, `todos.json`, contains an array of records, while the second, `todos2.json`, contains newline-delimited records.
This can be specified like so:

```sql
SELECT * FROM read_json('todos.json', format = 'array', records = true);
SELECT * FROM read_json('todos2.json', format = 'newline_delimited', records = true);
```

Values that are not records are also supported by setting `records = false`.
With this setting, each top-level JSON value is not required to be a JSON object but can also be a JSON array, string, or anything else supported in JSON.
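
As a sketch (assuming a hypothetical file `values.json` that contains one JSON value per line):

```sql
SELECT * FROM read_json('values.json', format = 'newline_delimited', records = false);
```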

## Manual Schemas

By default, DuckDB infers the schema, i.e., determines the names and types of the returned columns.
These can be specified manually like so:

```sql
SELECT * FROM read_json('todos.json',
    columns = {userId: 'INT', id: 'INT', title: 'VARCHAR', completed: 'BOOLEAN'},
    format = 'array', records = true);
```

You don't have to specify all fields, just the ones you're interested in:

```sql
SELECT * FROM read_json('todos.json',
    columns = {userId: 'INT', completed: 'BOOLEAN'},
    format = 'array', records = true);
```

Now that we know how to use the new DuckDB JSON table functions, let's dive into some analytics!
To get a feel of what the data looks like, we run the following query:
```sql
SELECT json_group_structure(json)
FROM (
    SELECT *
    FROM read_ndjson_objects('gharchive_gz/*.json.gz')
    LIMIT 2048
);
```

I've left `"payload"` out because it consists of deeply nested JSON, and its format differs based on the type of event.
So, how many records are we dealing with exactly? Let's count it using DuckDB:

```sql
SELECT count(*) AS count
FROM 'gharchive_gz/*.json.gz';
```

This is more activity than normal because most of the DuckDB developers were busy at the time.
Now, let's see who was the most active:

```sql
SELECT actor.login, count(*) AS count
FROM events
WHERE repo.name = 'duckdb/duckdb'
  AND type = 'PullRequestEvent'
GROUP BY actor.login
ORDER BY count DESC
LIMIT 5;
```

We've ignored the `"payload"` field because its contents differ based on the type of event.
We can see how they differ with the following query:

```sql
SELECT json_group_structure(payload) AS structure
FROM (SELECT *
      FROM read_json(
          'gharchive_gz/*.json.gz',
          columns = {
              id: 'BIGINT',
              type: 'VARCHAR',
              actor: 'STRUCT(id UBIGINT,
                             login VARCHAR,
                             display_login VARCHAR,
                             gravatar_id VARCHAR,
                             url VARCHAR,
                             avatar_url VARCHAR)',
              repo: 'STRUCT(id UBIGINT, name VARCHAR, url VARCHAR)',
              payload: 'JSON',
              public: 'BOOLEAN',
              created_at: 'TIMESTAMP',
              org: 'STRUCT(id UBIGINT, login VARCHAR, gravatar_id VARCHAR, url VARCHAR, avatar_url VARCHAR)'
          },
          format = 'newline_delimited'
      )
      WHERE type = 'WatchEvent'
      LIMIT 2048
);
```

Note that because we are not auto-detecting the schema, we have to supply `timestampformat` to parse the timestamps correctly.
The key `"user"` must be surrounded by quotes because it is a reserved keyword in SQL:

```sql
CREATE TABLE pr_events AS
SELECT *
FROM read_json(
    'gharchive_gz/*.json.gz',
    columns = {
        id: 'BIGINT',
        type: 'VARCHAR',
        actor: 'STRUCT(
            id UBIGINT,
            login VARCHAR,
            display_login VARCHAR,
            gravatar_id VARCHAR,
            url VARCHAR,
            avatar_url VARCHAR
        )',
        repo: 'STRUCT(id UBIGINT, name VARCHAR, url VARCHAR)',
        payload: 'STRUCT(
            action VARCHAR,
            number UBIGINT,
            pull_request STRUCT(
                url VARCHAR,
                id UBIGINT,
                title VARCHAR,
                "user" STRUCT(
                    login VARCHAR,
                    id UBIGINT
                ),
                body VARCHAR,
                created_at TIMESTAMP,
                updated_at TIMESTAMP,
                assignee STRUCT(login VARCHAR, id UBIGINT),
                assignees STRUCT(login VARCHAR, id UBIGINT)[]
            )
        )',
        public: 'BOOLEAN',
        created_at: 'TIMESTAMP',
        org: 'STRUCT(id UBIGINT, login VARCHAR, gravatar_id VARCHAR, url VARCHAR, avatar_url VARCHAR)'
    },
    format = 'newline_delimited',
    records = true,
    timestampformat = '%Y-%m-%dT%H:%M:%SZ'
)
WHERE type = 'PullRequestEvent';
```

This query completes in around 36s with an on-disk database (resulting size is 478MB) and 9s with an in-memory database.
We can check who was assigned the most:

```sql
WITH assignees AS (
    SELECT payload.pull_request.assignee.login AS assignee
    FROM pr_events
    UNION ALL
    SELECT unnest(payload.pull_request.assignees).login AS assignee
    FROM pr_events
)
SELECT assignee, count(*) AS count
FROM assignees
WHERE assignee NOT NULL
GROUP BY assignee
If you don't want to specify the schema of a field, you can set the type as `'JSON'`:
CREATE TABLE pr_events AS
SELECT *
FROM read_json(
    'gharchive_gz/*.json.gz',
    columns = {
        id: 'BIGINT',
        type: 'VARCHAR',
        actor: 'STRUCT(id UBIGINT,
                       login VARCHAR,
                       display_login VARCHAR,
                       gravatar_id VARCHAR,
                       url VARCHAR,
                       avatar_url VARCHAR)',
        repo: 'STRUCT(id UBIGINT, name VARCHAR, url VARCHAR)',
        payload: 'JSON',
        public: 'BOOLEAN',
        created_at: 'TIMESTAMP',
        org: 'STRUCT(id UBIGINT, login VARCHAR, gravatar_id VARCHAR, url VARCHAR, avatar_url VARCHAR)'
    },
    format = 'newline_delimited',
    records = true,
    timestampformat = '%Y-%m-%dT%H:%M:%SZ'
)
WHERE type = 'PullRequestEvent';
```
This will load the `"payload"` field as a JSON string, and we can use DuckDB's JSON functions to analyze it.
For example:

```sql
SELECT payload->>'action' AS action, count(*) AS count
FROM pr_events
GROUP BY action
ORDER BY count DESC;
```
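
A related sketch (assuming the same `pr_events` table): the `->` operator extracts JSON while `->>` extracts text, so nested fields can be chained:

```sql
SELECT payload->'pull_request'->>'title' AS title
FROM pr_events
LIMIT 5;
```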