Skip to content

Commit

Permalink
Add the ability to provide arguments and unique values (#9)
Browse files Browse the repository at this point in the history
* Update readme for module extras
* Add provider arguments and uniqueness support
  • Loading branch information
dacort authored Feb 2, 2024
1 parent 342ea0c commit 5969e16
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 26 deletions.
21 changes: 18 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ fake -n 10 pyint,user_name,date_this_year -f json
{"pyint": 5306, "user_name": "mark12", "date_this_year": "2023-04-16"}
```

### Column Names

Default column names aren't good enough for you? Fine, use your own.

```bash
Expand All @@ -85,9 +87,23 @@ fake -n 10 pyint,user_name,date_this_year -f json -c id,awesome_name,last_attent
{"id": 1967, "awesome_name": "jmendoza", "last_attention_at": "2023-01-23"}
```

### Provider Arguments

Some [Faker providers](https://faker.readthedocs.io/en/master/providers/baseprovider.html) (like `pyint`) take arguments. You can also specify those if you like, separated by semi-colons (_because some arguments take a comma-separated string :)_)

```bash
fake -n 10 "pyint(1;100),credit_card_number(amex),pystr_format(?#-####)" -f json -c id,credit_card_number,license_plate
```

And unique values are supported as well.

```bash
fake -n 10 "unique.pyint(1;10),unique.name"
```

### Parquet

OK, it had to happen, you can even write Parquet.
OK, it had to happen, you can even write Parquet.

Install with the `parquet` module: `pip install faker-cli[parquet]`

Expand All @@ -111,7 +127,6 @@ Install with the `delta` module: `pip install faker-cli[delta]`
fake -n 10 pyint,user_name,date_this_year -f deltalake -o sample_data
```


## Templates

The library includes a couple of templates that make it easier to generate certain types of fake data.
Expand All @@ -130,4 +145,4 @@ How about CloudFront? Go ahead.
fake -t cloudfront -n 10
```

> **Warning**: Both of these templates are still being validated - please be cautious!
> **Warning**: Both of these templates are still being validated - please be cautious!
35 changes: 17 additions & 18 deletions faker_cli/cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import sys
from typing import List

import click
from faker import Faker

from faker_cli.parser import infer_column_names, parse_column_types
from faker_cli.templates import (
CloudFrontLogs,
CloudFrontWriter,
Expand All @@ -12,18 +12,6 @@
)
from faker_cli.writer import CSVWriter, JSONWriter


def infer_column_names(col_names, col_types: str) -> List[str]:
    """
    Choose the header names for the generated columns.

    When ``col_names`` is non-empty it is split on commas and returned;
    otherwise the comma-separated entries of ``col_types`` double as the
    column names.
    """
    # No smarter parsing yet: whichever string is chosen is split on commas.
    header_source = col_names or col_types
    return header_source.split(",")


KLAS_MAPPER = {
"csv": CSVWriter,
"json": JSONWriter,
Expand Down Expand Up @@ -108,11 +96,22 @@ def main(num_rows, format, output, columns, template, column_types):
return

# Now, if a template hasn't been provided, generate some fake data!
col_types = column_types.split(",")
# col_types = column_types.split(",")
# Note that if args are provided, the column headers are less than ideal
col_types = parse_column_types(column_types)
headers = infer_column_names(columns, column_types)
writer = KLAS_MAPPER.get(format)(sys.stdout, headers, output)
format_klas = KLAS_MAPPER.get(format)
if format_klas is None:
raise click.ClickException(f"Format {format} not supported.")
writer = format_klas(sys.stdout, headers, output)
for i in range(num_rows):
# TODO: Handle args
row = [fake.format(ctype) for ctype in col_types]
writer.write(row)
writer.write(generate_row(fake, col_types))
writer.close()

def generate_row(fake: Faker, column_types: list[tuple[str, list]]) -> list[str]:
    """
    Build one output row, with one generated value per requested column.

    Each entry of ``column_types`` is a ``(provider, args)`` pair. A provider
    whose name starts with ``unique.`` is dispatched through ``fake.unique``
    with that prefix removed; every other provider goes straight through
    ``fake.format``. ``args`` are passed on as positional arguments.
    """
    unique_prefix = "unique."
    row = []
    for provider, provider_args in column_types:
        if provider.startswith(unique_prefix):
            value = fake.unique.format(provider.removeprefix(unique_prefix), *provider_args)
        else:
            value = fake.format(provider, *provider_args)
        row.append(value)
    return row
39 changes: 39 additions & 0 deletions faker_cli/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import List, Tuple

def infer_column_names(col_names, col_types: str) -> List[str]:
    """
    Infer column names from column types.

    Args:
        col_names: Optional comma-separated string of explicit column names.
            When non-empty it takes precedence and is split on commas.
        col_types: Comma-separated provider specification, e.g.
            ``"pyint(1;10),profile(ssn,birthdate)"``.

    Returns:
        One name per column. Without explicit names, each column is named
        after its full type token (including any parenthesised arguments).
    """
    if col_names:
        return col_names.split(",")

    # Split on commas that separate columns only. A comma inside a
    # parenthesised argument list (e.g. "profile(ssn,birthdate)") belongs to
    # the provider arguments; a plain split(",") would yield more headers than
    # columns. As in parse_column_types, arguments end at the first ")".
    names = []
    token = []
    inside_args = False
    for char in col_types:
        if char == "," and not inside_args:
            names.append("".join(token))
            token = []
            continue
        if char == "(":
            inside_args = True
        elif char == ")":
            inside_args = False
        token.append(char)
    names.append("".join(token))
    return names


def parse_column_types(input_string: str) -> List[Tuple[str, List]]:
    """
    Parse a comma-separated provider specification into (name, args) pairs.

    Each provider may carry optional parenthesis-enclosed arguments, which are
    separated by semicolons so that a single argument can itself contain
    commas. Arguments that look like integers are converted to ``int``.

    Example:
        ``"pyint(1;10),datetime,profile(ssn,birthdate)"`` becomes
        ``[("pyint", [1, 10]), ("datetime", []), ("profile", ["ssn,birthdate"])]``.

    Args:
        input_string: The raw column-type string given on the command line.

    Returns:
        A list of ``(provider_name, arguments)`` tuples in input order.
        Providers without arguments, or with empty parentheses such as
        ``"pyint()"``, get an empty argument list.
    """
    import re

    # Group 1: provider name (word characters and dots, so "unique.pyint" works).
    # Group 2: the text *inside* the parentheses. Capturing the inner text
    # directly avoids strip("()"), which would also remove parentheses that
    # are part of the argument itself.
    pattern = r"([\w\.]+)(?:\(([^)]*)\))?"

    result = []
    for name, raw_args in re.findall(pattern, input_string):
        # An absent or empty argument list means "call the provider with no
        # arguments", not "pass one empty string".
        if raw_args:
            args = [try_convert_to_int(val) for val in raw_args.split(";")]
        else:
            args = []
        result.append((name, args))

    return result


def try_convert_to_int(s):
    """Return ``int(s)`` if ``s`` parses as an integer; otherwise return ``s`` unchanged."""
    try:
        converted = int(s)
    except ValueError:
        # Not an integer literal: hand the original value back untouched.
        return s
    else:
        return converted
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "faker-cli"
version = "0.4.0"
version = "0.5.0"
description = "Command-line fake data generator"
authors = ["Damon P. Cortesi <d.lifehacker@gmail.com>"]
readme = "README.md"
Expand Down
42 changes: 38 additions & 4 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from faker_cli.cli import main
from click.testing import CliRunner
import json

import deltalake
from click.testing import CliRunner

from faker_cli.cli import main


# Test that help is provided if the user provides no arguments
Expand Down Expand Up @@ -44,6 +46,7 @@ def test_numlines():
lines = result.output.strip().splitlines()
assert len(lines) == (6 if format == "csv" else 5)


def test_custom_column_names():
runner = CliRunner()
result = runner.invoke(main, ["pyint,user_name", "-f", "json", "-c", "first,second"])
Expand All @@ -53,9 +56,10 @@ def test_custom_column_names():
assert len(data.keys()) == 2
assert list(data) == ["first", "second"]


def test_deltalake_output(tmp_path):
runner = CliRunner()
file = tmp_path / 'table'
file = tmp_path / "table"
result = runner.invoke(main, ["pyint,user_name", "-f", "deltalake", "-o", file])
assert result.exit_code == 0
delta_table = deltalake.DeltaTable(file)
Expand All @@ -65,4 +69,34 @@ def test_deltalake_output(tmp_path):

column_names = arrow_table.column_names
assert column_names == ["pyint", "user_name"]
assert arrow_table.num_columns == 2
assert arrow_table.num_columns == 2


def test_provider_args():
    """Check that ``pyint(1;10)`` yields ten JSON rows whose ``id`` is within 1..10."""
    runner = CliRunner()
    result = runner.invoke(main, ["-n", "10", "pyint(1;10)", "-f", "json", "-c", "id"])
    assert result.exit_code == 0
    lines = result.output.strip().splitlines()
    # Guard against a vacuous pass: with empty output the loop below would
    # never execute and the test would succeed without checking anything.
    assert len(lines) == 10
    for line in lines:
        data: dict = json.loads(line)
        assert data["id"] in range(1, 11)


def test_unique_provider_args():
    """Check that ``unique.pyint(1;10)`` over ten rows yields every value in 1..10."""
    allowed_ids = range(1, 11)
    result = CliRunner().invoke(main, ["-n", "10", "unique.pyint(1;10)", "-f", "json", "-c", "id"])
    assert result.exit_code == 0

    seen_ids = []
    for raw_line in result.output.strip().splitlines():
        record: dict = json.loads(raw_line)
        assert record["id"] in allowed_ids
        seen_ids.append(record["id"])

    # The collected ids must cover the whole 1..10 range.
    assert set(seen_ids) == set(allowed_ids)


def test_unique_provider_args_limit():
    """Check that asking for ten rows of ``unique.pyint(1;5)`` exits with code 1."""
    cli_args = ["-n", "10", "unique.pyint(1;5)", "-f", "json", "-c", "id"]
    outcome = CliRunner().invoke(main, cli_args)
    assert outcome.exit_code == 1

0 comments on commit 5969e16

Please sign in to comment.