Skip to content

Commit

Permalink
feat: supporting spaces in column names for csv files (#1388)
Browse files Browse the repository at this point in the history
Takes a `reverse quote id`, removes the backticks, and converts it to a
`simple id`.

---------

Co-authored-by: americast <sayan.sinha@cc.gatech.edu>
Co-authored-by: Andy Xu <xzdandy@gmail.com>
  • Loading branch information
3 people authored Dec 3, 2023
1 parent c2457b2 commit e5a9190
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 0 deletions.
13 changes: 13 additions & 0 deletions docs/source/overview/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,16 @@ If a query runs a complex AI task (e.g., sentiment analysis) on a large table, t
top
pgrep evadb_server
Can column names contain spaces?
--------------------------------

To use a column name that contains spaces, enclose it in backticks (reverse quotes). Below are example `CREATE TABLE` and `SELECT` queries:

.. code-block:: sql

   CREATE TABLE IF NOT EXISTS MyVideoCSV (
       id INTEGER UNIQUE,
       `frame id` INTEGER
   );

   SELECT id, `frame id` FROM MyVideoCSV;
7 changes: 7 additions & 0 deletions evadb/parser/lark_visitor/_common_clauses_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ def full_id(self, tree):
return (self.visit(tree.children[0]), self.visit(tree.children[1]))

def uid(self, tree):
    """Resolve a ``uid`` node to its identifier.

    When the first child is a token of type ``REVERSE_QUOTE_ID`` (a
    back-tick quoted name), the token is retagged in place as ``simple_id``
    and its text is returned as a plain string with every back tick removed.
    Any other child is handed to ``self.visit`` and that result is returned.
    """
    first_child = tree.children[0]

    # Children that carry no ``type`` attribute, or a different type, take
    # the regular visiting path.
    if getattr(first_child, "type", None) != "REVERSE_QUOTE_ID":
        return self.visit(first_child)

    first_child.type = "simple_id"
    return str(first_child).replace("`", "")

def full_column_name(self, tree):
Expand Down
44 changes: 44 additions & 0 deletions test/integration_tests/short/test_load_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import unittest
from pathlib import Path
from test.util import (
create_csv_with_comlumn_name_spaces,
create_dummy_csv_batches,
create_sample_csv,
create_sample_video,
Expand Down Expand Up @@ -117,6 +118,49 @@ def test_should_load_csv_in_table(self):
drop_query = "DROP TABLE IF EXISTS MyVideoCSV;"
execute_query_fetch_all(self.evadb, drop_query)

###################################
# integration tests for csv files with spaces in column names
def test_should_load_csv_in_table_with_spaces_in_column_name(self):
    """Load a CSV whose header contains spaces and read it back.

    The table is declared with back-tick quoted column names, the CSV is
    loaded into it, and the same quoted names are used in the SELECT. The
    sorted result must equal the first batch of
    ``create_dummy_csv_batches()`` once aliased to ``myvideocsv``.

    The table is dropped in a ``finally`` block: it is created with
    ``IF NOT EXISTS``, so a table left behind by a failed run would be
    silently reused by any later test that creates ``MyVideoCSV``.
    """
    # loading a csv requires a table to be created first
    create_table_query = """
CREATE TABLE IF NOT EXISTS MyVideoCSV (
id INTEGER UNIQUE,
`frame id` INTEGER,
`video id` INTEGER,
`dataset name` TEXT(30),
label TEXT(30),
bbox NDARRAY FLOAT32(4),
`object id` INTEGER
);
"""
    drop_query = "DROP TABLE IF EXISTS MyVideoCSV;"

    execute_query_fetch_all(self.evadb, create_table_query)
    try:
        # load the CSV
        load_query = (
            f"LOAD CSV '{create_csv_with_comlumn_name_spaces()}' INTO MyVideoCSV;"
        )
        execute_query_fetch_all(self.evadb, load_query)

        # execute a select query
        select_query = """SELECT id, `frame id`, `video id`,
`dataset name`, label, bbox,
`object id`
FROM MyVideoCSV;"""

        actual_batch = execute_query_fetch_all(self.evadb, select_query)
        actual_batch.sort()

        # assert the batches are equal
        expected_batch = next(create_dummy_csv_batches())
        expected_batch.modify_column_alias("myvideocsv")
        self.assertEqual(actual_batch, expected_batch)
    finally:
        # clean up even when a step above fails, so no stale table leaks
        # into other tests
        execute_query_fetch_all(self.evadb, drop_query)


if __name__ == "__main__":
unittest.main()
31 changes: 31 additions & 0 deletions test/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,37 @@ def create_sample_csv(num_frames=NUM_FRAMES):
return os.path.join(get_tmp_dir(), "dummy.csv")


def create_csv_with_comlumn_name_spaces(num_frames=NUM_FRAMES):
    """Write ``dummy.csv`` with space-separated column names and return its path.

    Any existing ``dummy.csv`` in the temporary directory is removed first.
    Two videos with ``num_frames`` rows each are generated; every row gets
    four random bbox coordinates joined by commas, a random label and a
    random object id. The file is written without the index column.
    """
    csv_path = os.path.join(get_tmp_dir(), "dummy.csv")

    # Start from a clean slate; a missing file is not an error.
    try:
        os.remove(csv_path)
    except FileNotFoundError:
        pass

    label_pool = ["car", "pedestrian", "bicycle"]
    video_count = 2
    rows = {}

    for video_id in range(video_count):
        for frame_id in range(num_frames):
            # Draw order (coords, label, object id) is kept so a seeded
            # generator yields the same file.
            coords = 200 + 300 * np.random.random(4)
            row_id = len(rows)
            rows[row_id] = {
                "id": row_id,
                "frame id": frame_id,
                "video id": video_id,
                "dataset name": "test_dataset",
                "label": label_pool[np.random.choice(len(label_pool))],
                "bbox": ",".join([str(value) for value in coords]),
                "object id": np.random.choice(3),
            }

    frame = pd.DataFrame.from_dict(rows, "index")
    frame.to_csv(csv_path, index=False)
    return csv_path


def create_dummy_csv_batches(target_columns=None):
if target_columns:
df = pd.read_csv(
Expand Down

0 comments on commit e5a9190

Please sign in to comment.