diff --git a/setup.py b/setup.py
index e69de29..b31ce24 100644
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,21 @@
+from setuptools import setup
+
+setup(
+    author='Ryan Hamilton',
+    author_email='ryan.hamilton@ec.gc.ca',
+    name='ee_data_eng',
+    version='0.1.0',
+    package_dir={'': 'src'},
+    # Single-module distribution: name the module explicitly so the
+    # console script below always has something to import.
+    py_modules=['ee_data_eng'],
+    install_requires=[
+        'click',
+        'geopandas',
+        'pandas'
+    ],
+    entry_points='''
+        [console_scripts]
+        ee_data_eng=ee_data_eng:cli
+    '''
+)
\ No newline at end of file
diff --git a/src/ee_data_eng.py b/src/ee_data_eng.py
index e69de29..e5512b4 100644
--- a/src/ee_data_eng.py
+++ b/src/ee_data_eng.py
@@ -0,0 +1,65 @@
+"""Combine training and validation spatial data into one labelled file.
+
+v0.1.0
+- Add click command line interface
+- Args, training_data, validation_data
+- take both spatial file and combine them into one dataframe and save it to a file
+- class_int
+"""
+
+from pathlib import Path
+
+import click
+import geopandas as gpd
+import pandas as pd
+
+
+@click.command()
+@click.argument('training_data', type=click.Path(exists=False))
+@click.argument('validation_data', type=click.Path(exists=False))
+@click.option('--label_col', default="class_name", help='Column that contains the class labels')
+@click.option('--output', default="processed/combined.shp", help='Output file path')
+def cli(training_data, validation_data, label_col, output):
+    """Merge TRAINING_DATA and VALIDATION_DATA into a single shapefile.
+
+    Each input is read with geopandas and tagged in a 'split' column
+    ('train' or 'test'). Every distinct label found in the label column is
+    assigned an integer code, starting at 1 in order of first appearance,
+    stored in a 'class_int' column. Only the label column, 'class_int',
+    'split' and 'geometry' are written to the output.
+    """
+    click.echo(f'Loading training data from {training_data}')
+    gdf_train = gpd.read_file(training_data)
+    gdf_train['split'] = 'train'
+    click.echo(gdf_train.head())
+
+    click.echo(f'Loading validation data from {validation_data}')
+    gdf_val = gpd.read_file(validation_data)
+    gdf_val['split'] = 'test'
+    click.echo(gdf_val.head())
+
+    click.echo('Combining training and validation data')
+    gdf = pd.concat([gdf_train, gdf_val])
+
+    if label_col not in gdf.columns:
+        raise click.BadParameter(
+            f'column {label_col!r} not found in the input data',
+            param_hint='--label_col',
+        )
+
+    # Lookup table: one row per distinct label (key) with its integer
+    # code (value). It is keyed on label_col itself so the merge below
+    # also works when --label_col is not the default 'class_name'.
+    labels = gdf[label_col].unique().tolist()
+    codes = list(range(1, len(labels) + 1))
+    lookup = pd.DataFrame({label_col: labels, 'class_int': codes})
+    click.echo(lookup)
+    gdf = gdf.merge(lookup, on=label_col)
+
+    gdf = gdf[[label_col, 'class_int', 'split', 'geometry']]
+
+    # parents=True so nested output directories are created as well.
+    output = Path(output)
+    output.parent.mkdir(parents=True, exist_ok=True)
+
+    gdf.to_file(output, driver='ESRI Shapefile')