From 7d56b43988d42f670045cb983c9d64663bec1840 Mon Sep 17 00:00:00 2001
From: Ivor Bosloper
Date: Wed, 19 Mar 2025 20:46:21 +0100
Subject: [PATCH 1/3] WIP create history command

---
 fiboa_cli/__init__.py | 49 ++++++++++++++++++++++++++++
 fiboa_cli/history.py  | 76 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 fiboa_cli/history.py

diff --git a/fiboa_cli/__init__.py b/fiboa_cli/__init__.py
index 3ce3a12e..fda0d552 100644
--- a/fiboa_cli/__init__.py
+++ b/fiboa_cli/__init__.py
@@ -12,6 +12,7 @@
 from .create_geojson import create_geojson as create_geojson_
 from .create_geoparquet import create_geoparquet as create_geoparquet_
 from .describe import describe as describe_
+from .history import history as history_
 from .improve import improve as improve_
 from .jsonschema import jsonschema as jsonschema_
 from .merge import DEFAULT_CRS
@@ -753,6 +754,53 @@ def improve(
         sys.exit(1)
 
 
+## History; given several years of data, generate a historic set
+## Currently works for crop-years
+@click.command()
+@click.argument("input", nargs=-1, type=click.Path(exists=True))
+@click.option(
+    "--out",
+    "-o",
+    type=click.Path(exists=False),
+    help="Path to write the GeoParquet file to. If not given, a name is generated.",
+    default=None,
+)
+@click.option(
+    "--column-filter",
+    "-c",
+    type=str,
+    help="Comma-separated list of column names to analyze historically",
+    default=None,
+)
+@click.option(
+    "--compression",
+    "-pc",
+    type=click.Choice(COMPRESSION_METHODS),
+    help="Compression method for the Parquet file.",
+    show_default=True,
+    default="brotli",
+)
+def history(
+    input,
+    out,
+    column_filter,
+    compression,
+):
+    """
+    Given several years of data, generate a dataset with historic data per column
+    """
+    log(f"fiboa CLI {__version__} - Generate history\n", "success")
+    try:
+        history_(
+            input,
+            out,
+            column_filter,
+            compression,
+        )
+    except Exception as e:
+        log(e, "error")
+        sys.exit(1)
+
+
 cli.add_command(describe)
 cli.add_command(validate)
 cli.add_command(validate_schema)
@@ -764,6 +812,7 @@ def improve(
 cli.add_command(rename_extension)
 cli.add_command(merge)
 cli.add_command(improve)
+cli.add_command(history)
 
 if __name__ == "__main__":
     cli()
diff --git a/fiboa_cli/history.py b/fiboa_cli/history.py
new file mode 100644
index 00000000..ecb8685c
--- /dev/null
+++ b/fiboa_cli/history.py
@@ -0,0 +1,76 @@
+import os
+
+from .const import CORE_COLUMNS
+from .parquet import create_parquet
+from .util import (
+    is_schema_empty,
+    load_parquet_data,
+    load_parquet_schema,
+    log,
+    parse_metadata,
+    pick_schemas,
+)
+import re
+
+COLUMNS = ("crop:code", "crop:name", "crop:name_en", "ec:hcat_name", "ec:hcat_code", "ec:translated_name")
+
+def history(
+    input,
+    out=None,
+    column_filter=None,
+    compression=None,
+):
+    # alternatively, look this up from determination_datetime (does not work in all cases)
+    year_index = {int(re.search(r"\d{4}", i).group(0)): index for index, i in enumerate(input)}
+    assert len(year_index) == len(input), "Different input files with same year not implemented"
+    newest_index = list(year_index).index(max(year_index))
+    newest_file = input[newest_index]
+    if not out:
+        out = newest_file.replace(".parquet", "_hist.parquet")
+    else:
+        directory = os.path.dirname(out)
+        if directory:
+            os.makedirs(directory, exist_ok=True)
+
+    # Load the dataset
+    schemas = [load_parquet_schema(i) for i in input]
+    collections = [parse_metadata(schema, b"fiboa") for schema in schemas]
+
+    gdf = load_parquet_data(newest_file)
+    columns = list(schemas[newest_index].names)
+    for year, index in year_index.items():
+        if index == newest_index:
+            continue
+        add_columns = [name for name in schemas[index].names if name in (column_filter or COLUMNS)]
+        path = input[index]
+        if len(add_columns) == 0:
+            log(f"No columns added for file {path} year {year}", "warning")
+            continue
+        new_columns = [f"{year}:{c}" for c in add_columns]
+        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name])
+
+        # https://geopandas.org/en/stable/docs/user_guide/set_operations.html
+        # https://geopandas.org/en/stable/docs/reference/api/geopandas.overlay.html#geopandas.overlay
+        overlap = gdf.overlay(gdf2, how='intersection')
+        # Add area column
+
+        # Determine whether the given CRS is in meters
+        if gdf.crs.axis_info[0].unit_name not in ["m", "metre", "meter"]:
+            # Reproject the geometries to an equal-area projection if needed
+            overlap = overlap.to_crs("EPSG:6933")
+
+        # Compute the missing area values (m² to hectares)
+        overlap["area"] = overlap.geometry.area * 0.0001
+
+        # TODO:
+        # group by id_1, look for max(crop:name, key=area), and add this as a column to gdf
+        # Start debugging here!
+
+    # Write the merged dataset to the output file
+    # TODO: create a proper collection
+    collection = collections[1]
+
+    create_parquet(
+        gdf, columns, collection, out, {}, compression=compression
+    )
+    log(f"Wrote data to {out}", "success")
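
A note on the core mechanism before the next revision: the loop above intersects the newest year's parcels with an older year's parcels via geopandas overlay, then measures each intersection in hectares. A minimal standalone sketch of that step, with toy squares in a metric CRS instead of real parcel files (ids, crop names, and sizes here are illustrative only):

import geopandas as gpd
from shapely.geometry import box

# Newest-year parcels (left) and older-year parcels (right) in a metric CRS
gdf = gpd.GeoDataFrame(
    {"id": ["a", "b"]},
    geometry=[box(0, 0, 100, 100), box(200, 0, 300, 100)],
    crs="EPSG:6933",
)
gdf2 = gpd.GeoDataFrame(
    {"crop:name": ["wheat", "maize"]},
    geometry=[box(0, 0, 60, 100), box(70, 0, 100, 100)],
    crs="EPSG:6933",
)

# Each output row is the piece of a left parcel covered by one right parcel
overlap = gdf.overlay(gdf2, how="intersection")
overlap["area"] = overlap.geometry.area * 0.0001  # m² -> hectares
print(overlap[["id", "crop:name", "area"]])
#   id crop:name  area
# 0  a     wheat   0.6
# 1  a     maize   0.3

Parcel "b" touches no older-year parcel and simply drops out of the overlay; parcel "a" is split across two crops, which is exactly the ambiguity the largest-area rule in the following patches resolves.
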
From 363aa41bfdc5670b2c3dd091df555dd020212680 Mon Sep 17 00:00:00 2001
From: Ivor Bosloper
Date: Wed, 19 Mar 2025 22:16:17 +0100
Subject: [PATCH 2/3] update

---
 fiboa_cli/history.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/fiboa_cli/history.py b/fiboa_cli/history.py
index ecb8685c..90fc9896 100644
--- a/fiboa_cli/history.py
+++ b/fiboa_cli/history.py
@@ -47,21 +47,15 @@ def history(
             log(f"No columns added for file {path} year {year}", "warning")
             continue
         new_columns = [f"{year}:{c}" for c in add_columns]
-        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name])
 
-        # https://geopandas.org/en/stable/docs/user_guide/set_operations.html
-        # https://geopandas.org/en/stable/docs/reference/api/geopandas.overlay.html#geopandas.overlay
-        overlap = gdf.overlay(gdf2, how='intersection')
-        # Add area column
+        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name])
+        overlap = gdf[["id", "geometry"]].overlay(gdf2, how='intersection')
 
-        # Determine whether the given CRS is in meters
         if gdf.crs.axis_info[0].unit_name not in ["m", "metre", "meter"]:
-            # Reproject the geometries to an equal-area projection if needed
             overlap = overlap.to_crs("EPSG:6933")
-
-        # Compute the missing area values (m² to hectares)
         overlap["area"] = overlap.geometry.area * 0.0001
 
+        overlap.groupby(["id_1"])
         # TODO:
         # group by id_1, look for max(crop:name, key=area), and add this as a column to gdf
         # Start debugging here!
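
This revision still ends on a dangling overlap.groupby(["id_1"]); the TODO is to keep, per parcel, the overlapping crop with the largest area. A self-contained plain-pandas sketch of that selection on toy data ("id_1" follows the patch's TODO; it is the suffix geopandas overlay gives a colliding left-frame "id" column):

import pandas as pd

overlap = pd.DataFrame({
    "id_1": ["a", "a", "b"],  # parcel id carried over from the left frame
    "crop:name": ["wheat", "maize", "barley"],
    "area": [0.6, 0.3, 1.2],
})
# idxmax gives the row label of the largest area per parcel; loc selects those rows
largest = overlap.loc[overlap.groupby("id_1")["area"].idxmax()]
print(largest)
#   id_1 crop:name  area
# 0    a     wheat   0.6
# 2    b    barley   1.2

The third patch adopts this same idxmax pattern, with a groupby-sum beforehand so that several fragments of the same crop within one parcel are counted together.
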
From e4aa8e953fc840f1879cf1e3b39ee171b3a67c80 Mon Sep 17 00:00:00 2001
From: Ivor Bosloper
Date: Thu, 27 Mar 2025 17:11:49 +0100
Subject: [PATCH 3/3] WIP

---
 fiboa_cli/history.py | 44 +++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/fiboa_cli/history.py b/fiboa_cli/history.py
index 90fc9896..676b3e04 100644
--- a/fiboa_cli/history.py
+++ b/fiboa_cli/history.py
@@ -1,16 +1,14 @@
 import os
-
-from .const import CORE_COLUMNS
 from .parquet import create_parquet
 from .util import (
-    is_schema_empty,
     load_parquet_data,
     load_parquet_schema,
     log,
     parse_metadata,
-    pick_schemas,
 )
 import re
+import pandas as pd
+import numpy as np
 
 COLUMNS = ("crop:code", "crop:name", "crop:name_en", "ec:hcat_name", "ec:hcat_code", "ec:translated_name")
@@ -37,6 +35,12 @@ def history(
     collections = [parse_metadata(schema, b"fiboa") for schema in schemas]
 
     gdf = load_parquet_data(newest_file)
+    # TODO: how to handle non-unique ids, maybe generate an additional "_a"
+    # https://stackoverflow.com/a/26601343/193886
+    gdf.drop_duplicates('id', inplace=True)
+
+    # gdf[gdf.index.duplicated()].sort_values(by='id')
+    gdf.set_index("id", drop=False, inplace=True)
     columns = list(schemas[newest_index].names)
     for year, index in year_index.items():
         if index == newest_index:
@@ -46,19 +50,37 @@ def history(
         if len(add_columns) == 0:
             log(f"No columns added for file {path} year {year}", "warning")
             continue
-        new_columns = [f"{year}:{c}" for c in add_columns]
-        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name])
-        overlap = gdf[["id", "geometry"]].overlay(gdf2, how='intersection')
+        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name], nrows=1000)
+        overlap = gdf[["id", "geometry"]].overlay(gdf2, how='intersection', keep_geom_type=False)
 
         if gdf.crs.axis_info[0].unit_name not in ["m", "metre", "meter"]:
             overlap = overlap.to_crs("EPSG:6933")
         overlap["area"] = overlap.geometry.area * 0.0001
 
-        overlap.groupby(["id_1"])
-        # TODO:
-        # group by id_1, look for max(crop:name, key=area), and add this as a column to gdf
-        # Start debugging here!
+        # largest_overlap = overlap.groupby(["id_1"])['area'].nlargest(1)
+
+        groupby = ["id", *add_columns]
+        subset = overlap[groupby + ["area"]]
+        area_per_group = subset.groupby(groupby, as_index=False).sum("area")
+        # largest_overlap = area_per_group.groupby(groupby, as_index=False)['area'].nlargest(1)
+
+        largest_overlap = area_per_group.loc[area_per_group.groupby("id")["area"].idxmax()]
+        largest_overlap.set_index("id", inplace=True)
+
+        # largest_overlap = area_per_group.groupby("id", as_index=False)[['area', 'crop:code', 'crop:name']][0]
+        # Use the same index: https://stackoverflow.com/a/72932903/193886
+        largest_overlap[str(year)] = largest_overlap[add_columns].to_dict("records")
+        gdf = gdf.assign(**{str(year): largest_overlap[str(year)]})
+        breakpoint()
+
+        # largest_overlap.loc["105449105.0"]
+        # gdf.loc[gdf["id"]==largest_overlap["id"]]["history"] = 10
+        # https://stackoverflow.com/a/70991362/193886
+        # gdf['history'] = np.where(gdf['id'].reset_index(drop=True) == largest_overlap['id'].reset_index(drop=True), largest_overlap['crop:name'], None)
+
+        # pd.merge(gdf, largest_overlap, on="id")
 
     # Write the merged dataset to the output file
    # TODO: create a proper collection
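
The duplicate-id handling above is still provisional: drop_duplicates('id') silently discards parcels, while the TODO suggests generating an "_a"-style suffix instead. A hedged sketch of that alternative using a per-id counter (toy data, plain pandas; not part of the fiboa CLI API):

import pandas as pd

gdf = pd.DataFrame({"id": ["x", "x", "y"]})
# Number repeated ids 0, 1, 2, ... within each group of equal ids
dup = gdf.groupby("id").cumcount()
# Keep the first occurrence unchanged; suffix later ones to make ids unique
gdf["id"] = gdf["id"].where(dup == 0, gdf["id"] + "_" + dup.astype(str))
print(gdf["id"].tolist())  # ['x', 'x_1', 'y']

With unique ids, set_index("id") yields the unambiguous index that the per-year column assignment via gdf.assign relies on.
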