Commit d40e328

Merge pull request #272 from data-engineering-collective/not_in_predicate
Add `not in` predicate operation
2 parents 3938f4c + 01a7790

File tree: 6 files changed, +114 -15 lines changed

- CHANGES.rst
- plateau/serialization/_generic.py
- plateau/serialization/_parquet.py
- tests/io_components/test_read.py
- tests/serialization/test_filter.py
- tests/serialization/test_parquet.py

CHANGES.rst

Lines changed: 1 addition & 0 deletions
```diff
@@ -5,6 +5,7 @@ Changelog
 Plateau 4.6.2 (2025-08-XX)
 ==========================
 
+* Add support for `not in` predicate operation.
 * Add further validation for predicates to raise errors if operators are misused with non-scalar values
 
 
```
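For orientation: plateau predicates are lists of conjunctions of `(column, operator, value)` tuples, and `not in`, like `in`, takes a tuple or list of values. A minimal sketch of the new operator through `filter_array_like` (the function changed in `_generic.py` below); that it returns the boolean mask is an assumption based on the surrounding tests:

```python
import pandas as pd

from plateau.serialization._generic import filter_array_like

values = pd.Series([0, 4, 13, 29])

# Select rows whose value is NOT a member of the list: the complement of "in".
mask = filter_array_like(values, "not in", [4, 29])
print(values[mask].tolist())  # expected: [0, 13]
```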

plateau/serialization/_generic.py

Lines changed: 22 additions & 12 deletions
```diff
@@ -206,12 +206,13 @@ def check_predicates(predicates: PredicatesType) -> None:
                     f"with null value and operator '{op}'. Only operators supporting null values "
                     "are '==', '!=', 'in' and 'is distinct from'."
                 )
-            if op == "in" and pd.api.types.is_scalar(val):
-                raise ValueError(
-                    f"Invalid predicates in clause {clause_idx} in conjunction {conjunction_idx} "
-                    f"with operator '{op}' must be used with a tuple or list, got {type(val)} instead."
-                )
-            if op != "in" and is_list_like(val):
+            if op in ("in", "not in"):
+                if pd.api.types.is_scalar(val):
+                    raise ValueError(
+                        f"Invalid predicates in clause {clause_idx} in conjunction {conjunction_idx} "
+                        f"with operator '{op}' must be used with a tuple or list, got {type(val)} instead."
+                    )
+            elif is_list_like(val):
                 raise ValueError(
                     f"Invalid predicates in clause {clause_idx} in conjunction {conjunction_idx} "
                     f"with operator '{op}' must be used with a scalar type, got {type(val)} instead."
```
```diff
@@ -515,7 +516,8 @@ def filter_array_like(
         np.logical_and(array_like < value, mask, out=out)
     elif op == ">":
         np.logical_and(array_like > value, mask, out=out)
-    elif op == "in":
+    elif op in ("in", "not in"):
+        inclusive = op == "in"
         value = np.asarray(value)
         nullmask = pd.isnull(value)
         if value.dtype.kind in ("U", "S", "O"):
@@ -548,11 +550,19 @@ def filter_array_like(
         if any(nullmask):
             matching_idx |= pd.isnull(array_like)
 
-        np.logical_and(
-            matching_idx,
-            mask,
-            out=out,
-        )
+        if inclusive:
+            np.logical_and(
+                matching_idx,
+                mask,
+                out=out,
+            )
+        else:
+            np.logical_and(
+                ~matching_idx,
+                mask,
+                out=out,
+            )
+
     else:
         raise NotImplementedError("op not supported")
 
```
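Because `not in` is implemented as the complement of the membership mask, null handling flips with it: `matching_idx` absorbs nulls for `in` whenever the value list contains a null, so `~matching_idx` excludes them for `not in`. A sketch of that semantics in plain numpy/pandas (illustrative, not the library call itself):

```python
import numpy as np
import pandas as pd

array_like = pd.Series([0.0, 4.0, np.nan])

# Membership mask as built above: isin() plus an explicit null match,
# because the value list [4.0, nan] contains a null.
matching_idx = array_like.isin([4.0]).to_numpy()
matching_idx |= pd.isnull(array_like).to_numpy()

print(matching_idx)   # "in"     -> [False  True  True]
print(~matching_idx)  # "not in" -> [ True False False]
```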

plateau/serialization/_parquet.py

Lines changed: 18 additions & 0 deletions
```diff
@@ -582,6 +582,24 @@ def _predicate_accepts(predicate, row_meta, arrow_schema, parquet_reader):
             elif min_value <= x <= max_value:
                 return True
         return False
+    elif op == "not in":
+        # We can only exclude a row group if we know that every element in the
+        # row group is listed in the predicate values. The only situations in
+        # which the full content of the row group is known are
+        # min_value == max_value
+        # or null_count == len
+
+        if min_value == max_value:
+            for v in val:
+                if pd.isnull(v):
+                    if parquet_statistics.null_count > 0:
+                        continue
+                elif v == min_value:
+                    continue
+                break
+            else:
+                return False
+        return True
     else:
         raise NotImplementedError("op not supported")
```
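In words: a row group can only be skipped for `not in` when the statistics pin down its entire content, i.e. `min_value == max_value` (all non-null rows hold one value) and each element the group contains is covered by `val`. A standalone sketch of that decision rule, inverted so that `True` means "skip" (hypothetical helper mirroring the branch above, not a library API):

```python
import pandas as pd


def can_skip_row_group_for_not_in(min_value, max_value, null_count, val) -> bool:
    """Illustrative mirror of the committed 'not in' pruning branch."""
    if min_value != max_value:
        return False  # group content not fully known; the group must be read
    for v in val:
        if pd.isnull(v):
            if null_count > 0:
                continue  # null in val, and the group does contain nulls
        elif v == min_value:
            continue  # v equals the group's single non-null value
        break  # this v proves nothing; exclusion cannot be shown
    else:
        return True  # every v is accounted for by the group's content
    return False
```

For example, statistics `min=0, max=0, null_count=1` let the rule skip for `val=[0, float("nan")]` but not for `val=[0, 7]`, matching the `test_predicate_accept_notin_excludes` cases further down.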

tests/io_components/test_read.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -36,6 +36,14 @@ def test_dispatch_metapartitions(dataset, store_session):
             [[("mycol", "in", "scalar")]],
             "operator 'in' must be used with a tuple or list",
         ),
+        (
+            [[("mycol", "not in", None)]],
+            "Invalid predicates: Clause 0 in conjunction 0 with null value and operator 'not in'.",
+        ),
+        (
+            [[("mycol", "not in", "scalar")]],
+            "operator 'not in' must be used with a tuple or list",
+        ),
         ([[("mycol", "<", [17, 12])]], "operator '<' must be used with a scalar type"),
     ],
 )
```

tests/serialization/test_filter.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -108,14 +108,14 @@ def test_filter_array_like_categoricals(op, expected, cat_type):
         pytest.param([True], True, marks=pytest.mark.xfail(reason="see gh-193")),
     ],
 )
-@pytest.mark.parametrize("op", ["==", "!=", "<", "<=", ">", ">=", "in"])
+@pytest.mark.parametrize("op", ["==", "!=", "<", "<=", ">", ">=", "in", "not in"])
 def test_raise_on_type(value, filter_value, op):
     array_like = pd.Series([value])
     with pytest.raises(TypeError, match="Unexpected type for predicate:"):
         filter_array_like(array_like, op, filter_value, strict_date_types=True)
 
 
-@pytest.mark.parametrize("op", ["==", "!=", ">=", "<=", ">", "<", "in"])
+@pytest.mark.parametrize("op", ["==", "!=", ">=", "<=", ">", "<", "in", "not in"])
 @pytest.mark.parametrize(
     "data,value",
     [
@@ -170,7 +170,7 @@ def test_filter_df_from_predicates(op, data, value):
     if isinstance(df["A"].dtype, pd.CategoricalDtype):
         df["A"] = df["A"].astype(df["A"].cat.as_ordered().dtype)
 
-    if op == "in":
+    if op in ["in", "not in"]:
         value = [value]
 
     predicates = [[("A", op, value)]]
@@ -181,6 +181,8 @@ def test_filter_df_from_predicates(op, data, value):
         value = pd.Series(value, dtype=df["A"].dtype).iloc[0]
     if op == "in":
         expected = df[df["A"].isin([value])]
+    elif op == "not in":
+        expected = df[~df["A"].isin([value])]
     else:
         expected = eval(f"df[df['A'] {op} value]")
     pdt.assert_frame_equal(actual, expected, check_categorical=False)
```
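The test oracle leans on pandas: for a single filter value, `not in` is simply the complement of `Series.isin`. A self-contained illustration:

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3, 4]})

# "A in [2, 4]" versus its complement "A not in [2, 4]".
assert df[df["A"].isin([2, 4])]["A"].tolist() == [2, 4]
assert df[~df["A"].isin([2, 4])]["A"].tolist() == [1, 3]
```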

tests/serialization/test_parquet.py

Lines changed: 60 additions & 0 deletions
```diff
@@ -413,6 +413,66 @@ def test_predicate_accept_in(store, predicate_value, expected):
     )
 
 
+@pytest.mark.parametrize(
+    ["predicate_value", "expected"],
+    [
+        ([0, 4, 1], True),
+        ([-2, 44], True),
+        ([-3, 0], True),
+        ([-1, 10**4], True),
+        ([2, 3], True),
+        ([-1, 20], True),
+        ([-30, -5, 50, 10], True),
+        ([-30, -5, 50, np.nan], True),
+        ([], True),
+    ],
+)
+def test_predicate_accept_notin(store, predicate_value, expected):
+    df = pd.DataFrame({"A": [0, 4, 13, 29]})  # min = 0, max = 29
+    predicate = ("A", "not in", predicate_value)
+    serialiser = ParquetSerializer(chunk_size=None)
+    key = serialiser.store(store, "prefix", df)
+
+    parquet_file = ParquetFile(store.open(key))
+    row_meta = parquet_file.metadata.row_group(0)
+    arrow_schema = parquet_file.schema.to_arrow_schema()
+    parquet_reader = parquet_file.reader
+    assert (
+        _predicate_accepts(
+            predicate,
+            row_meta=row_meta,
+            arrow_schema=arrow_schema,
+            parquet_reader=parquet_reader,
+        )
+        == expected
+    )
+
+
+@pytest.mark.parametrize(
+    ["predicate_value", "test_data"],
+    [
+        ([0], [0, 0]),
+        ([0, np.nan], [0, 0, np.nan]),
+    ],
+)
+def test_predicate_accept_notin_excludes(store, predicate_value, test_data):
+    df = pd.DataFrame({"A": test_data})  # min == max == 0
+    predicate = ("A", "not in", predicate_value)
+    serialiser = ParquetSerializer(chunk_size=None)
+    key = serialiser.store(store, "prefix", df)
+
+    parquet_file = ParquetFile(store.open(key))
+    row_meta = parquet_file.metadata.row_group(0)
+    arrow_schema = parquet_file.schema.to_arrow_schema()
+    parquet_reader = parquet_file.reader
+    assert not _predicate_accepts(
+        predicate,
+        row_meta=row_meta,
+        arrow_schema=arrow_schema,
+        parquet_reader=parquet_reader,
+    )
+
+
 def test_read_categorical(store):
     df = pd.DataFrame({"col": ["a"]}).astype({"col": "category"})
```
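These tests feed `_predicate_accepts` real row-group metadata. To inspect the same statistics yourself, pyarrow exposes them directly (illustrative snippet; `example.parquet` is a placeholder path):

```python
import pandas as pd
import pyarrow.parquet as pq

# Write a tiny file, then read back the per-column row-group statistics
# that _predicate_accepts consults for its pruning decision.
pd.DataFrame({"A": [0, 4, 13, 29]}).to_parquet("example.parquet")

stats = pq.ParquetFile("example.parquet").metadata.row_group(0).column(0).statistics
print(stats.min, stats.max, stats.null_count)  # 0 29 0
```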
