From 7d56b43988d42f670045cb983c9d64663bec1840 Mon Sep 17 00:00:00 2001
From: Ivor Bosloper
Date: Wed, 19 Mar 2025 20:46:21 +0100
Subject: [PATCH 1/3] WIP create history command

---
 fiboa_cli/__init__.py | 49 ++++++++++++++++++++++++++++
 fiboa_cli/history.py  | 76 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 fiboa_cli/history.py

diff --git a/fiboa_cli/__init__.py b/fiboa_cli/__init__.py
index 3ce3a12e..fda0d552 100644
--- a/fiboa_cli/__init__.py
+++ b/fiboa_cli/__init__.py
@@ -12,6 +12,7 @@
 from .create_geojson import create_geojson as create_geojson_
 from .create_geoparquet import create_geoparquet as create_geoparquet_
 from .describe import describe as describe_
+from .history import history as history_
 from .improve import improve as improve_
 from .jsonschema import jsonschema as jsonschema_
 from .merge import DEFAULT_CRS
@@ -753,6 +754,53 @@ def improve(
         sys.exit(1)
 
 
+## History; given several years of data, generate a historic set
+## Currently works for crop-years
+@click.command()
+@click.argument("input", nargs=-1, type=click.Path(exists=True))
+@click.option(
+    "--out",
+    "-o",
+    type=click.Path(exists=False),
+    help="Path to write the GeoParquet file to. If not given, a name is generated.",
+    default=None,
+)
+@click.option(
+    "--column-filter",
+    "-c",
+    type=str,
+    help="Comma-separated list of column names to analyze historically",
+    default=None,
+)
+@click.option(
+    "--compression",
+    "-pc",
+    type=click.Choice(COMPRESSION_METHODS),
+    help="Compression method for the Parquet file.",
+    show_default=True,
+    default="brotli",
+)
+def history(
+    input,
+    out,
+    column_filter,
+    compression,
+):
+    """
+    Given several years of data, generate a dataset with historic data per column
+    """
+    log(f"fiboa CLI {__version__} - Generate history\n", "success")
+    try:
+        history_(
+            input,
+            out,
+            column_filter,
+            compression,
+        )
+    except Exception as e:
+        log(e, "error")
+        sys.exit(1)
+
+
 cli.add_command(describe)
 cli.add_command(validate)
 cli.add_command(validate_schema)
@@ -764,6 +812,7 @@ def improve(
 cli.add_command(rename_extension)
 cli.add_command(merge)
 cli.add_command(improve)
+cli.add_command(history)
 
 if __name__ == "__main__":
     cli()
diff --git a/fiboa_cli/history.py b/fiboa_cli/history.py
new file mode 100644
index 00000000..ecb8685c
--- /dev/null
+++ b/fiboa_cli/history.py
@@ -0,0 +1,76 @@
+import os
+
+from .const import CORE_COLUMNS
+from .parquet import create_parquet
+from .util import (
+    is_schema_empty,
+    load_parquet_data,
+    load_parquet_schema,
+    log,
+    parse_metadata,
+    pick_schemas,
+)
+import re
+
+COLUMNS = ("crop:code", "crop:name", "crop:name_en", "ec:hcat_name", "ec:hcat_code", "ec:translated_name")
+
+def history(
+    input,
+    out=None,
+    column_filter=None,
+    compression=None,
+):
+    # alternatively, look this up from determination_datetime (does not work in all cases)
+    year_index = {int(re.search(r"\d{4}", i).group(0)): index for index, i in enumerate(input)}
+    assert len(year_index) == len(input), "Different input files with same year not implemented"
+    newest_index = list(year_index).index(max(year_index))
+    newest_file = input[newest_index]
+    if not out:
+        out = newest_file.replace(".parquet", "_hist.parquet")
+    else:
+        directory = os.path.dirname(out)
+        if directory:
+            os.makedirs(directory, exist_ok=True)
+
+    # Load the dataset
+    schemas = [load_parquet_schema(i) for i in input]
+    collections = [parse_metadata(schema, b"fiboa") for schema in schemas]
+
+    gdf = load_parquet_data(newest_file)
+    columns = list(schemas[newest_index].names)
+    for year, index in year_index.items():
+        if index == newest_index:
+            continue
+        add_columns = [name for name in schemas[index].names if name in (column_filter or COLUMNS)]
+        path = input[index]
+        if len(add_columns) == 0:
+            log(f"No columns added for file {path} year {year}", "warning")
+            continue
+        new_columns = [f"{year}:{c}" for c in add_columns]
+        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name])
+
+        # https://geopandas.org/en/stable/docs/user_guide/set_operations.html
+        # https://geopandas.org/en/stable/docs/reference/api/geopandas.overlay.html#geopandas.overlay
+        overlap = gdf.overlay(gdf2, how='intersection')
+        # Add area column
+
+        # Determine whether the given CRS is in meters
+        if gdf.crs.axis_info[0].unit_name not in ["m", "metre", "meter"]:
+            # Reproject the geometries to an equal-area projection if needed
+            overlap = overlap.to_crs("EPSG:6933")
+
+        # Compute the missing area values (m² to hectares)
+        overlap["area"] = overlap.geometry.area * 0.0001
+
+        # TODO:
+        # group by id_1, look for max(crop:name, key=area), and add this as a column to gdf
+        # Start debugging here!
+
+    # Write the merged dataset to the output file
+    # TODO: create a proper collection
+    collection = collections[1]
+
+    create_parquet(
+        gdf, columns, collection, out, {}, compression=compression
+    )
+    log(f"Wrote data to {out}", "success")
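
A note on the core mechanism before the next revision: the loop above intersects the newest year's parcels with an older year's parcels via geopandas overlay, then measures each intersection in hectares. A minimal standalone sketch of that step, with toy squares in a metric CRS instead of real parcel files (ids, crop names, and sizes here are illustrative only):

import geopandas as gpd
from shapely.geometry import box

# Newest-year parcels (left) and older-year parcels (right) in a metric CRS
gdf = gpd.GeoDataFrame(
    {"id": ["a", "b"]},
    geometry=[box(0, 0, 100, 100), box(200, 0, 300, 100)],
    crs="EPSG:6933",
)
gdf2 = gpd.GeoDataFrame(
    {"crop:name": ["wheat", "maize"]},
    geometry=[box(0, 0, 60, 100), box(70, 0, 100, 100)],
    crs="EPSG:6933",
)

# Each output row is the piece of a left parcel covered by one right parcel
overlap = gdf.overlay(gdf2, how="intersection")
overlap["area"] = overlap.geometry.area * 0.0001  # m² -> hectares
print(overlap[["id", "crop:name", "area"]])
#   id crop:name  area
# 0  a     wheat   0.6
# 1  a     maize   0.3

Parcel "b" touches no older-year parcel and simply drops out of the overlay; parcel "a" is split across two crops, which is exactly the ambiguity the largest-area rule in the following patches resolves.
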
From 363aa41bfdc5670b2c3dd091df555dd020212680 Mon Sep 17 00:00:00 2001
From: Ivor Bosloper
Date: Wed, 19 Mar 2025 22:16:17 +0100
Subject: [PATCH 2/3] update

---
 fiboa_cli/history.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/fiboa_cli/history.py b/fiboa_cli/history.py
index ecb8685c..90fc9896 100644
--- a/fiboa_cli/history.py
+++ b/fiboa_cli/history.py
@@ -47,21 +47,15 @@ def history(
             log(f"No columns added for file {path} year {year}", "warning")
             continue
         new_columns = [f"{year}:{c}" for c in add_columns]
-        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name])
 
-        # https://geopandas.org/en/stable/docs/user_guide/set_operations.html
-        # https://geopandas.org/en/stable/docs/reference/api/geopandas.overlay.html#geopandas.overlay
-        overlap = gdf.overlay(gdf2, how='intersection')
-        # Add area column
+        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name])
+        overlap = gdf[["id", "geometry"]].overlay(gdf2, how='intersection')
 
-        # Determine whether the given CRS is in meters
         if gdf.crs.axis_info[0].unit_name not in ["m", "metre", "meter"]:
-            # Reproject the geometries to an equal-area projection if needed
             overlap = overlap.to_crs("EPSG:6933")
-
-        # Compute the missing area values (m² to hectares)
         overlap["area"] = overlap.geometry.area * 0.0001
 
+        overlap.groupby(["id_1"])
         # TODO:
         # group by id_1, look for max(crop:name, key=area), and add this as a column to gdf
         # Start debugging here!
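
This revision still ends on a dangling overlap.groupby(["id_1"]); the TODO is to keep, per parcel, the overlapping crop with the largest area. A self-contained plain-pandas sketch of that selection on toy data ("id_1" follows the patch's TODO; it is the suffix geopandas overlay gives a colliding left-frame "id" column):

import pandas as pd

overlap = pd.DataFrame({
    "id_1": ["a", "a", "b"],  # parcel id carried over from the left frame
    "crop:name": ["wheat", "maize", "barley"],
    "area": [0.6, 0.3, 1.2],
})
# idxmax gives the row label of the largest area per parcel; loc selects those rows
largest = overlap.loc[overlap.groupby("id_1")["area"].idxmax()]
print(largest)
#   id_1 crop:name  area
# 0    a     wheat   0.6
# 2    b    barley   1.2

The third patch adopts this same idxmax pattern, with a groupby-sum beforehand so that several fragments of the same crop within one parcel are counted together.
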
From e4aa8e953fc840f1879cf1e3b39ee171b3a67c80 Mon Sep 17 00:00:00 2001
From: Ivor Bosloper
Date: Thu, 27 Mar 2025 17:11:49 +0100
Subject: [PATCH 3/3] WIP

---
 fiboa_cli/history.py | 44 +++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/fiboa_cli/history.py b/fiboa_cli/history.py
index 90fc9896..676b3e04 100644
--- a/fiboa_cli/history.py
+++ b/fiboa_cli/history.py
@@ -1,16 +1,14 @@
 import os
-
-from .const import CORE_COLUMNS
 from .parquet import create_parquet
 from .util import (
-    is_schema_empty,
     load_parquet_data,
     load_parquet_schema,
     log,
     parse_metadata,
-    pick_schemas,
 )
 import re
+import pandas as pd
+import numpy as np
 
 COLUMNS = ("crop:code", "crop:name", "crop:name_en", "ec:hcat_name", "ec:hcat_code", "ec:translated_name")
@@ -37,6 +35,12 @@ def history(
     collections = [parse_metadata(schema, b"fiboa") for schema in schemas]
 
     gdf = load_parquet_data(newest_file)
+    # TODO: how to handle non-unique ids, maybe generate an additional "_a"
+    # https://stackoverflow.com/a/26601343/193886
+    gdf.drop_duplicates('id', inplace=True)
+
+    # gdf[gdf.index.duplicated()].sort_values(by='id')
+    gdf.set_index("id", drop=False, inplace=True)
     columns = list(schemas[newest_index].names)
     for year, index in year_index.items():
         if index == newest_index:
@@ -46,19 +50,37 @@ def history(
         if len(add_columns) == 0:
             log(f"No columns added for file {path} year {year}", "warning")
             continue
-        new_columns = [f"{year}:{c}" for c in add_columns]
-        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name])
-        overlap = gdf[["id", "geometry"]].overlay(gdf2, how='intersection')
+        gdf2 = load_parquet_data(path, columns=add_columns + [gdf.active_geometry_name], nrows=1000)
+        overlap = gdf[["id", "geometry"]].overlay(gdf2, how='intersection', keep_geom_type=False)
 
         if gdf.crs.axis_info[0].unit_name not in ["m", "metre", "meter"]:
             overlap = overlap.to_crs("EPSG:6933")
         overlap["area"] = overlap.geometry.area * 0.0001
 
-        overlap.groupby(["id_1"])
-        # TODO:
-        # group by id_1, look for max(crop:name, key=area), and add this as a column to gdf
-        # Start debugging here!
+        # largest_overlap = overlap.groupby(["id_1"])['area'].nlargest(1)
+
+        groupby = ["id", *add_columns]
+        subset = overlap[groupby + ["area"]]
+        area_per_group = subset.groupby(groupby, as_index=False).sum("area")
+        # largest_overlap = area_per_group.groupby(groupby, as_index=False)['area'].nlargest(1)
+
+        largest_overlap = area_per_group.loc[area_per_group.groupby("id")["area"].idxmax()]
+        largest_overlap.set_index("id", inplace=True)
+
+        # largest_overlap = area_per_group.groupby("id", as_index=False)[['area', 'crop:code', 'crop:name']][0]
+        # Use the same index: https://stackoverflow.com/a/72932903/193886
+        largest_overlap[str(year)] = largest_overlap[add_columns].to_dict("records")
+        gdf = gdf.assign(**{str(year): largest_overlap[str(year)]})
+        breakpoint()
+
+        # largest_overlap.loc["105449105.0"]
+        # gdf.loc[gdf["id"]==largest_overlap["id"]]["history"] = 10
+        # https://stackoverflow.com/a/70991362/193886
+        # gdf['history'] = np.where(gdf['id'].reset_index(drop=True) == largest_overlap['id'].reset_index(drop=True), largest_overlap['crop:name'], None)
+
+        # pd.merge(gdf, largest_overlap, on="id")
 
     # Write the merged dataset to the output file
    # TODO: create a proper collection
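
The duplicate-id handling above is still provisional: drop_duplicates('id') silently discards parcels, while the TODO suggests generating an "_a"-style suffix instead. A hedged sketch of that alternative using a per-id counter (toy data, plain pandas; not part of the fiboa CLI API):

import pandas as pd

gdf = pd.DataFrame({"id": ["x", "x", "y"]})
# Number repeated ids 0, 1, 2, ... within each group of equal ids
dup = gdf.groupby("id").cumcount()
# Keep the first occurrence unchanged; suffix later ones to make ids unique
gdf["id"] = gdf["id"].where(dup == 0, gdf["id"] + "_" + dup.astype(str))
print(gdf["id"].tolist())  # ['x', 'x_1', 'y']

With unique ids, set_index("id") yields the unambiguous index that the per-year column assignment via gdf.assign relies on.
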