Fix non-functional stackdates in grid.py (#329)

equinor · Jul 6, 2021 · be46913 · be46913
1 parent 499b041
commit be46913
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 20 deletions.
diff --git a/ecl2df/grid.py b/ecl2df/grid.py
@@ -124,6 +124,10 @@ def rst2df(
     """Return a dataframe with dynamic data from the restart file
     for each cell, at a particular date.
 
+    The dataframe will have a dummy index. The column named
+    "active" refers to the active cell index, and is to be used
+    when merging with the grid geometry dataframe.
+
     Args:
         eclfiles: EclFiles object
         date: datetime.date or list of datetime.date, must
@@ -137,7 +141,7 @@ def rst2df(
          dateinheaders: boolean on whether the date should
             be added to the column headers. Instead of
             SGAS as a column header, you get SGAS@YYYY-MM-DD.
-         stackdates: Default is false. If true, a column
+         stackdates: Default is false. If True, a column
             called DATE will be added and data for all restart
             dates will be added in a stacked manner. Implies
             dateinheaders False.
@@ -233,16 +237,18 @@ def rst2df(
         # Remove columns that are all NaN:
         rst_df.dropna(axis="columns", how="all", inplace=True)
 
+        rst_df.index.name = "active"
+
         rst_dfs[datestr] = rst_df
 
     if not rst_dfs:
         return pd.DataFrame()
 
     if not stackdates:
-        return pd.concat(rst_dfs.values(), axis=1)
+        return pd.concat(rst_dfs.values(), axis=1).reset_index()
+
     rststack = pd.concat(rst_dfs, sort=False).reset_index()
     rststack.rename(columns={"level_0": "DATE"}, inplace=True)
-    del rststack["level_1"]
     return rststack
 
 
@@ -451,21 +457,21 @@ def df(
     any time dependent data from Restart files.
 
     Args:
-        eclfiles (EclFiles): Handle to an Eclipse case
-        vectors (str or list): Vectors to include, wildcards
+        eclfiles: Handle to an Eclipse case
+        vectors: Vectors to include, wildcards
             supported. Used to match both
             INIT vectors and RESTART vectors.
         dropconstants (bool): If true, columns that are constant
             for every cell are dropped.
-        rstdates (list, str or datetime): Restart dates to include in ISO-8601 format.
+        rstdates: Restart dates to include in ISO-8601 format.
             Alternatively, pick from the mnemonics 'first', 'all' and 'last'.
-        dateinheaders (bool): Whether columns with data from UNRST files
+        dateinheaders: Whether columns with data from UNRST files
             should always have the ISO-date embedded in the column header.
-        stackdates (bool): Default is false. If true, a column
+        stackdates: Default is false. If true, a column
             called DATE will be added and data for all restart
             dates will be added in a stacked manner. Implies
             dateinheaders False.
-        zonemap (dict): A zonemap dictionary mapping every K index to a
+        zonemap: A zonemap dictionary mapping every K index to a
             string, which will be put in a column ZONE. If none is provided,
             a zonemap from a default file will be looked for. Provide an empty
             dictionary to avoid looking for the default file, and no ZONE
@@ -482,10 +488,18 @@ def df(
             dateinheaders=dateinheaders,
             stackdates=stackdates,
         )
-    grid_df = pd.concat([gridgeom, initdf, rst_df], axis=1, sort=False)
+    grid_df = gridgeom.merge(
+        initdf, how="outer", on=None, left_index=True, right_index=True
+    )
+
+    if rst_df is not None and not rst_df.empty:
+        grid_df = grid_df.merge(
+            rst_df, how="outer", left_index=True, right_on="active"
+        ).reset_index(drop=True)
+
     if dropconstants:
         grid_df = drop_constant_columns(grid_df)
-    return grid_df
+    return grid_df.drop("active", axis="columns", errors="ignore")
 
 
 def fill_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:

diff --git a/tests/test_grid.py b/tests/test_grid.py
@@ -279,7 +279,7 @@ def test_df():
     grid_df = grid.df(eclfiles, vectors=["PRESSURE"], rstdates="last", stackdates=True)
     assert "PRESSURE" in grid_df
     assert len(grid_df.columns) == geometry_cols + 2
-    assert "DATE" in grid_df  # awaits stacking
+    assert "DATE" in grid_df  # Present because of stackdates
 
     grid_df = grid.df(eclfiles, vectors="PRESSURE", rstdates="last")
     assert "PRESSURE" in grid_df
@@ -289,11 +289,38 @@ def test_df():
     assert "PRESSURE" not in grid_df
     assert "PRESSURE@2001-08-01" in grid_df
 
-    grid_df = grid.df(eclfiles, vectors="PRESSURE", rstdates="all", stackdates=True)
+    grid_df = grid.df(
+        eclfiles, vectors=["PORO", "PRESSURE"], rstdates="all", stackdates=True
+    )
     assert "PRESSURE" in grid_df
-    assert len(grid_df.columns) == geometry_cols + 2
+    assert len(grid_df.columns) == geometry_cols + 3
     assert "DATE" in grid_df
     assert len(grid_df["DATE"].unique()) == 4
+    assert not grid_df.isna().any().any()
+    # Check that all but the dynamic data has been repeated:
+    df1 = (
+        grid_df[grid_df["DATE"] == "2000-01-01"]
+        .drop(["DATE", "PRESSURE"], axis=1)
+        .reset_index(drop=True)
+    )
+    df2 = (
+        grid_df[grid_df["DATE"] == "2000-07-01"]
+        .drop(["PRESSURE", "DATE"], axis=1)
+        .reset_index(drop=True)
+    )
+    df3 = (
+        grid_df[grid_df["DATE"] == "2001-02-01"]
+        .drop(["PRESSURE", "DATE"], axis=1)
+        .reset_index(drop=True)
+    )
+    df4 = (
+        grid_df[grid_df["DATE"] == "2001-08-01"]
+        .drop(["PRESSURE", "DATE"], axis=1)
+        .reset_index(drop=True)
+    )
+    pd.testing.assert_frame_equal(df1, df2)
+    pd.testing.assert_frame_equal(df1, df3)
+    pd.testing.assert_frame_equal(df1, df4)
 
     grid_df = grid.df(eclfiles, vectors="PORO")
     assert "I" in grid_df
@@ -305,7 +332,7 @@ def test_df():
     assert "I" in grid_df
     assert "PORO" in grid_df
     assert "DATE" not in grid_df
-    # (no RST columns, so no DATE info in the daaframe)
+    # (no RST columns, so no DATE info in the dataframe)
     # (warnings should be printed)
 
     grid_df = grid.df(eclfiles, vectors="PORO", rstdates="all", stackdates=True)
@@ -403,9 +430,9 @@ def test_get_available_rst_dates():
 def test_rst2df():
     """Test producing dataframes from restart files"""
     eclfiles = EclFiles(DATAFILE)
-    assert grid.rst2df(eclfiles, "first").shape == (35817, 23)
-    assert grid.rst2df(eclfiles, "last").shape == (35817, 23)
-    assert grid.rst2df(eclfiles, "all").shape == (35817, 23 * 4)
+    assert grid.rst2df(eclfiles, "first").shape == (35817, 24)
+    assert grid.rst2df(eclfiles, "last").shape == (35817, 24)
+    assert grid.rst2df(eclfiles, "all").shape == (35817, 23 * 4 + 1)
 
     assert "SOIL" in grid.rst2df(eclfiles, date="first", dateinheaders=False)
     assert (
@@ -417,11 +444,21 @@ def test_rst2df():
     assert rst_df["DATE"].unique()[0] == "2000-01-01"
     rst_df = grid.rst2df(eclfiles, "all", stackdates=True)
     assert len(rst_df["DATE"].unique()) == len(grid.get_available_rst_dates(eclfiles))
-    assert rst_df.shape == (4 * 35817, 23 + 1)  # "DATE" is now the extra column
+
+    # "DATE" and "active" are now the extra columns:
+    assert rst_df.shape == (4 * 35817, 23 + 2)
+
+    # Test that only the PPCW column contains NaNs (only defined for selected cells)
+    nancols = rst_df.isna().any()
+    assert nancols["PPCW"]
+    assert (
+        len(rst_df[["PPCW", "DATE"]].dropna()["DATE"].unique()) == 4
+    )  # All dates present
+    assert sum(nancols) == 1  # All other columns are "False"
 
     # Check vector slicing:
     rst_df = grid.rst2df(eclfiles, "first", vectors="S???")
-    assert rst_df.shape == (35817, 3)
+    assert rst_df.shape == (35817, 4)
     assert "SGAS" in rst_df
     assert "SWAT" in rst_df
     assert "SOIL" in rst_df  # This is actually computed