diff --git a/README.md b/README.md index ef721c7..43b77a9 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,8 @@ fake -n 10 pyint,user_name,date_this_year -f json {"pyint": 5306, "user_name": "mark12", "date_this_year": "2023-04-16"} ``` +### Column Names + Default column names aren't good enough for you? Fine, use your own. ```bash @@ -85,9 +87,23 @@ fake -n 10 pyint,user_name,date_this_year -f json -c id,awesome_name,last_attent {"id": 1967, "awesome_name": "jmendoza", "last_attention_at": "2023-01-23"} ``` +### Provider Arguments + +Some [Faker providers](https://faker.readthedocs.io/en/master/providers/baseprovider.html) (like `pyint`) take arguments. You can also specify those if you like, separated by semi-colons (_because some arguments take a comma-separated string :)_) + +```bash +fake -n 10 "pyint(1;100),credit_card_number(amex),pystr_format(?#-####)" -f json -c id,credit_card_number,license_plate +``` + +And unique values are supported as well. + +```bash +fake -n 10 "unique.pyint(1;10),unique.name" +``` + ### Parquet -OK, it had to happen, you can even write Parquet. +OK, it had to happen, you can even write Parquet. Install with the `parquet` module: `pip install faker-cli[parquet]` @@ -111,7 +127,6 @@ Install with the `delta` module: `pip install faker-cli[delta]` fake -n 10 pyint,user_name,date_this_year -f deltalake -o sample_data ``` - ## Templates The libary includes a couple templates that can be used to generate certain types of fake data easier. @@ -130,4 +145,4 @@ How about CloudFront? Go ahead. fake -t cloudfront -n 10 ``` -> **Warning**: Both of these templates are still being validated - please be cautious! \ No newline at end of file +> **Warning**: Both of these templates are still being validated - please be cautious! 
diff --git a/faker_cli/cli.py b/faker_cli/cli.py index b8d0c8d..adb0af3 100644 --- a/faker_cli/cli.py +++ b/faker_cli/cli.py @@ -1,9 +1,9 @@ import sys -from typing import List import click from faker import Faker +from faker_cli.parser import infer_column_names, parse_column_types from faker_cli.templates import ( CloudFrontLogs, CloudFrontWriter, @@ -12,18 +12,6 @@ ) from faker_cli.writer import CSVWriter, JSONWriter - -def infer_column_names(col_names, col_types: str) -> List[str]: - """ - Infer column names from column types - """ - # For now, nothing special - but eventually we need to parse things out - if col_names: - return col_names.split(",") - - return col_types.split(",") - - KLAS_MAPPER = { "csv": CSVWriter, "json": JSONWriter, @@ -108,11 +96,22 @@ def main(num_rows, format, output, columns, template, column_types): return # Now, if a template hasn't been provided, generate some fake data! - col_types = column_types.split(",") + # col_types = column_types.split(",") + # Note that if args are provided, the column headers are less than ideal + col_types = parse_column_types(column_types) headers = infer_column_names(columns, column_types) - writer = KLAS_MAPPER.get(format)(sys.stdout, headers, output) + format_klas = KLAS_MAPPER.get(format) + if format_klas is None: + raise click.ClickException(f"Format {format} not supported.") + writer = format_klas(sys.stdout, headers, output) for i in range(num_rows): - # TODO: Handle args - row = [fake.format(ctype) for ctype in col_types] - writer.write(row) + writer.write(generate_row(fake, col_types)) writer.close() + +def generate_row(fake: Faker, column_types: list[tuple[str, list]]) -> list[str]: + return [ + fake.format(ctype, *args) + if not ctype.startswith("unique.") + else fake.unique.format(ctype.removeprefix("unique."), *args) + for ctype, args in column_types + ] \ No newline at end of file diff --git a/faker_cli/parser.py b/faker_cli/parser.py new file mode 100644 index 0000000..e3750d3 --- /dev/null 
+++ b/faker_cli/parser.py @@ -0,0 +1,39 @@ +from typing import List, Tuple + +def infer_column_names(col_names, col_types: str) -> List[str]: + """ + Infer column names from column types + """ + # For now, nothing special - but eventually we need to parse things out + if col_names: + return col_names.split(",") + + return col_types.split(",") + + +def parse_column_types(input_string: str) -> List[Tuple[str, List]]: + """ + Parse a string of the format "pyint(1;10),datetime,profile(ssn,birthdate)" and split it by commas with optional parenthesis-enclosed arguments. + """ + import re + + pattern = r"([\w\.]+)(\([^)]*\))?" + matches = re.findall(pattern, input_string) + + # Extract the matched groups + result = [ + (match[0], ([try_convert_to_int(val) for val in match[1].strip("()").split(";")] if match[1] else [])) + for match in matches + ] + + # print(result) + # [('pyint', ['1', '10']), ('datetime', []), ('profile', ['ssn,birthdate'])] + + return result + + +def try_convert_to_int(s): + try: + return int(s) + except ValueError: + return s diff --git a/pyproject.toml b/pyproject.toml index c3bd62b..f8ff96b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "faker-cli" -version = "0.4.0" +version = "0.5.0" description = "Command-line fake data generator" authors = ["Damon P. 
Cortesi "] readme = "README.md" diff --git a/tests/test_cli.py b/tests/test_cli.py index c6d4250..80eb593 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,7 +1,9 @@ -from faker_cli.cli import main -from click.testing import CliRunner import json + import deltalake +from click.testing import CliRunner + +from faker_cli.cli import main # Test that help is provided if the user provides no arguments @@ -44,6 +46,7 @@ def test_numlines(): lines = result.output.strip().splitlines() assert len(lines) == (6 if format == "csv" else 5) + def test_custom_column_names(): runner = CliRunner() result = runner.invoke(main, ["pyint,user_name", "-f", "json", "-c", "first,second"]) @@ -53,9 +56,10 @@ def test_custom_column_names(): assert len(data.keys()) == 2 assert list(data) == ["first", "second"] + def test_deltalake_output(tmp_path): runner = CliRunner() - file = tmp_path / 'table' + file = tmp_path / "table" result = runner.invoke(main, ["pyint,user_name", "-f", "deltalake", "-o", file]) assert result.exit_code == 0 delta_table = deltalake.DeltaTable(file) @@ -65,4 +69,34 @@ def test_deltalake_output(tmp_path): column_names = arrow_table.column_names assert column_names == ["pyint", "user_name"] - assert arrow_table.num_columns == 2 \ No newline at end of file + assert arrow_table.num_columns == 2 + + +def test_provider_args(): + runner = CliRunner() + result = runner.invoke(main, ["-n", "10", "pyint(1;10)", "-f", "json", "-c", "id"]) + assert result.exit_code == 0 + lines = result.output.strip().splitlines() + for line in lines: + data: dict = json.loads(line) + assert data["id"] in range(1, 11) + + +def test_unique_provider_args(): + runner = CliRunner() + result = runner.invoke(main, ["-n", "10", "unique.pyint(1;10)", "-f", "json", "-c", "id"]) + assert result.exit_code == 0 + lines = result.output.strip().splitlines() + vals = [] + for line in lines: + data: dict = json.loads(line) + assert data["id"] in range(1, 11) + vals.append(data["id"]) + + assert 
set(vals) == set(range(1, 11)) + + +def test_unique_provider_args_limit(): + runner = CliRunner() + result = runner.invoke(main, ["-n", "10", "unique.pyint(1;5)", "-f", "json", "-c", "id"]) + assert result.exit_code == 1