Add system tests for API Security Custom Data Classification (RFC-0980) (#5468)

jandro996 · web-flow · commit 873324818732 · 2025-10-14T09:46:37.000+02:00
Add RFC-0980 system tests:

processor overrides v2 (include/exclude)
custom scanners with user-defined tags
RC capabilities 16/17
schema extraction validation
diff --git a/manifests/cpp_nginx.yml b/manifests/cpp_nginx.yml
@@ -19,6 +19,12 @@ tests/:
         Test_API_Security_RC_ASM_DD_processors: v1.8.0
         Test_API_Security_RC_ASM_DD_scanners: v1.8.0
       test_apisec_sampling.py: "irrelevant (sampling works differently in proxies: RFC 1035)"
+      test_custom_data_classification.py:
+        Test_API_Security_Custom_Data_Classification_Capabilities: missing_feature
+        Test_API_Security_Custom_Data_Classification_Multiple_Scanners: missing_feature
+        Test_API_Security_Custom_Data_Classification_Negative: missing_feature
+        Test_API_Security_Custom_Data_Classification_Processor_Override: missing_feature
+        Test_API_Security_Custom_Data_Classification_Scanner: missing_feature
       test_endpoint_discovery.py: irrelevant (not applicable to proxies)
       test_schemas.py:
         Test_Scanners: v1.8.0
diff --git a/manifests/dotnet.yml b/manifests/dotnet.yml
@@ -16,6 +16,12 @@ tests/:
         Test_API_Security_Sampling_Different_Status: v2.46.0
         Test_API_Security_Sampling_Rate: v2.46.0
         Test_API_Security_Sampling_With_Delay: v2.50.0
+      test_custom_data_classification.py:
+        Test_API_Security_Custom_Data_Classification_Capabilities: missing_feature
+        Test_API_Security_Custom_Data_Classification_Multiple_Scanners: missing_feature
+        Test_API_Security_Custom_Data_Classification_Negative: missing_feature
+        Test_API_Security_Custom_Data_Classification_Processor_Override: missing_feature
+        Test_API_Security_Custom_Data_Classification_Scanner: missing_feature
       test_endpoint_discovery.py:
         Test_Endpoint_Discovery: v3.24.0
       test_schemas.py:
diff --git a/manifests/golang.yml b/manifests/golang.yml
@@ -23,6 +23,12 @@ tests/:
           net-http: irrelevant (net-http doesn't handle path params)
           net-http-orchestrion: irrelevant (net-http doesn't handle path params)
         Test_API_Security_Sampling_With_Delay: missing_feature
+      test_custom_data_classification.py:
+        Test_API_Security_Custom_Data_Classification_Capabilities: missing_feature
+        Test_API_Security_Custom_Data_Classification_Multiple_Scanners: missing_feature
+        Test_API_Security_Custom_Data_Classification_Negative: missing_feature
+        Test_API_Security_Custom_Data_Classification_Processor_Override: missing_feature
+        Test_API_Security_Custom_Data_Classification_Scanner: missing_feature
       test_endpoint_discovery.py:
         Test_Endpoint_Discovery: missing_feature
       test_schemas.py:
diff --git a/manifests/java.yml b/manifests/java.yml
@@ -70,6 +70,22 @@ tests/:
           spring-boot-3-native: irrelevant (GraalVM. Tracing support only)
           vertx3: v1.51.0
           vertx4: v1.51.0
+      test_custom_data_classification.py:
+        Test_API_Security_Custom_Data_Classification_Capabilities:
+          '*': missing_feature
+          spring-boot-3-native: irrelevant (GraalVM. Tracing support only)
+        Test_API_Security_Custom_Data_Classification_Multiple_Scanners:
+          '*': missing_feature
+          spring-boot-3-native: irrelevant (GraalVM. Tracing support only)
+        Test_API_Security_Custom_Data_Classification_Negative:
+          '*': missing_feature
+          spring-boot-3-native: irrelevant (GraalVM. Tracing support only)
+        Test_API_Security_Custom_Data_Classification_Processor_Override:
+          '*': missing_feature
+          spring-boot-3-native: irrelevant (GraalVM. Tracing support only)
+        Test_API_Security_Custom_Data_Classification_Scanner:
+          '*': missing_feature
+          spring-boot-3-native: irrelevant (GraalVM. Tracing support only)
       test_endpoint_discovery.py:
         Test_Endpoint_Discovery:
           '*': missing_feature
diff --git a/manifests/nodejs.yml b/manifests/nodejs.yml
@@ -90,6 +90,12 @@ tests/:
         Test_API_Security_Sampling_Different_Status: *ref_5_27_0
         Test_API_Security_Sampling_Rate: irrelevant (new api security sampling algorithm implemented)
         Test_API_Security_Sampling_With_Delay: *ref_5_27_0
+      test_custom_data_classification.py:
+        Test_API_Security_Custom_Data_Classification_Capabilities: missing_feature
+        Test_API_Security_Custom_Data_Classification_Multiple_Scanners: missing_feature
+        Test_API_Security_Custom_Data_Classification_Negative: missing_feature
+        Test_API_Security_Custom_Data_Classification_Processor_Override: missing_feature
+        Test_API_Security_Custom_Data_Classification_Scanner: missing_feature
       test_endpoint_discovery.py:
         Test_Endpoint_Discovery:
           '*': missing_feature
diff --git a/manifests/php.yml b/manifests/php.yml
@@ -16,6 +16,12 @@ tests/:
         Test_API_Security_Sampling_Different_Status: v1.11.0
         Test_API_Security_Sampling_Rate: irrelevant (new sampling algorithm implemented)
         Test_API_Security_Sampling_With_Delay: v1.11.0
+      test_custom_data_classification.py:
+        Test_API_Security_Custom_Data_Classification_Capabilities: missing_feature
+        Test_API_Security_Custom_Data_Classification_Multiple_Scanners: missing_feature
+        Test_API_Security_Custom_Data_Classification_Negative: missing_feature
+        Test_API_Security_Custom_Data_Classification_Processor_Override: missing_feature
+        Test_API_Security_Custom_Data_Classification_Scanner: missing_feature
       test_endpoint_discovery.py:
         Test_Endpoint_Discovery: missing_feature
       test_schemas.py:
diff --git a/manifests/python.yml b/manifests/python.yml
@@ -16,6 +16,12 @@ tests/:
         Test_API_Security_Sampling_Different_Status: v2.6.0
         Test_API_Security_Sampling_Rate: irrelevant (new api security sampling algorithm implemented)
         Test_API_Security_Sampling_With_Delay: v2.6.0
+      test_custom_data_classification.py:
+        Test_API_Security_Custom_Data_Classification_Capabilities: missing_feature
+        Test_API_Security_Custom_Data_Classification_Multiple_Scanners: missing_feature
+        Test_API_Security_Custom_Data_Classification_Negative: missing_feature
+        Test_API_Security_Custom_Data_Classification_Processor_Override: missing_feature
+        Test_API_Security_Custom_Data_Classification_Scanner: missing_feature
       test_endpoint_discovery.py:
         Test_Endpoint_Discovery:
           '*': v3.13.0.dev
diff --git a/manifests/ruby.yml b/manifests/ruby.yml
@@ -20,6 +20,12 @@ tests/:
           rack: irrelevant (rack does not have path parameters support)
         Test_API_Security_Sampling_Rate: irrelevant
         Test_API_Security_Sampling_With_Delay: v2.18.0
+      test_custom_data_classification.py:
+        Test_API_Security_Custom_Data_Classification_Capabilities: missing_feature
+        Test_API_Security_Custom_Data_Classification_Multiple_Scanners: missing_feature
+        Test_API_Security_Custom_Data_Classification_Negative: missing_feature
+        Test_API_Security_Custom_Data_Classification_Processor_Override: missing_feature
+        Test_API_Security_Custom_Data_Classification_Scanner: missing_feature
       test_endpoint_discovery.py:
         Test_Endpoint_Discovery:
           "*": v2.22.0.dev
diff --git a/tests/appsec/api_security/test_custom_data_classification.py b/tests/appsec/api_security/test_custom_data_classification.py
@@ -0,0 +1,159 @@
+# Unless explicitly stated otherwise all files in this repository are licensed under the Apache License Version 2.0.
+# This product includes software developed at Datadog (https://www.datadoghq.com/).
+# Copyright 2021 Datadog, Inc.
+
+from utils import interfaces, rfc, scenarios, weblog, features, logger
+from utils.dd_constants import Capabilities
+
+from tests.appsec.api_security.utils import BaseAppsecApiSecurityRcTest
+
+
+def get_schema(request, address):
+    """Get the API Security schema recorded in the spans of `request`.
+
+    Looks up the `_dd.appsec.s.<address>` key in the `meta` of every span
+    yielded by `interfaces.library.get_spans(request)`.
+
+    Args:
+        request: the request whose spans are inspected.
+        address: schema address suffix appended to `_dd.appsec.s.` (e.g. "req.bodytest").
+
+    Returns:
+        The first payload found under that key, or None when no span carries it.
+    """
+    # The meta key is the same for every span, so build it once outside the loop
+    key = "_dd.appsec.s." + address
+    for _, _, span in interfaces.library.get_spans(request):
+        payload = span.get("meta", {}).get(key)
+        if payload is not None:
+            return payload
+        logger.info(f"Schema not found in span meta for {key}")
+    return None
+
+
+@rfc("https://docs.google.com/document/d/1wBrd-ShGoA9-aP96o0VIe46eBgw73GL1315R8QjuMoc/edit?tab=t.0")
+@scenarios.appsec_api_security_rc
+@features.api_security_configuration
+class Test_API_Security_Custom_Data_Classification_Capabilities(BaseAppsecApiSecurityRcTest):
+    """Check that the ASM_PROCESSOR_OVERRIDES and ASM_CUSTOM_DATA_SCANNERS RC capabilities are exposed"""
+
+    def setup_capabilities_check(self):
+        """Run the shared scenario setup before the capability assertions"""
+        self.setup_scenario()
+
+    def test_capabilities_check(self):
+        """Assert both custom data classification capabilities, one after the other"""
+        expected_capabilities = (
+            Capabilities.ASM_PROCESSOR_OVERRIDES,  # capability 16
+            Capabilities.ASM_CUSTOM_DATA_SCANNERS,  # capability 17
+        )
+        for capability in expected_capabilities:
+            interfaces.library.assert_rc_capability(capability)
+
+
+@rfc("https://docs.google.com/document/d/1wBrd-ShGoA9-aP96o0VIe46eBgw73GL1315R8QjuMoc/edit?tab=t.0")
+@scenarios.appsec_api_security_rc
+@features.api_security_configuration
+class Test_API_Security_Custom_Data_Classification_Processor_Override(BaseAppsecApiSecurityRcTest):
+    """Test API Security - Custom Data Classification with Processor Override"""
+
+    def setup_request_method(self):
+        """Run the shared scenario setup, then send a GET request carrying a `testcard` query parameter"""
+        self.setup_scenario()
+        self.request = weblog.get("/tag_value/api_rc_processor/200?testcard=1234567890")
+
+    def test_request_method(self):
+        """Verify the `req.querytest` schema is reported and contains the `testcard` parameter"""
+        schema = get_schema(self.request, "req.querytest")
+        assert self.request.status_code == 200
+        assert schema is not None, "Schema should be present in the span"
+        assert isinstance(schema, list), "Schema should be a list"
+        # An empty list must fail the test: skipping the check below when the schema
+        # is empty would let the test pass without verifying anything
+        assert schema, "Schema should not be empty"
+
+        # Verify that the testcard parameter is present in the schema
+        assert "testcard" in schema[0], "testcard parameter should be in the schema"
+
+
+@rfc("https://docs.google.com/document/d/1wBrd-ShGoA9-aP96o0VIe46eBgw73GL1315R8QjuMoc/edit?tab=t.0")
+@scenarios.appsec_api_security_rc
+@features.api_security_configuration
+class Test_API_Security_Custom_Data_Classification_Scanner(BaseAppsecApiSecurityRcTest):
+    """Test API Security - Custom Data Classification with Custom Scanner"""
+
+    def setup_request_method(self):
+        """Run the shared scenario setup, then POST a body containing a `testcard` field"""
+        self.setup_scenario()
+        self.request = weblog.post("/tag_value/api_rc_scanner/200", data={"testcard": "1234567890"})
+
+    def test_request_method(self):
+        """Verify the `req.bodytest` schema contains `testcard` and, when present, its custom classification"""
+        schema = get_schema(self.request, "req.bodytest")
+        assert self.request.status_code == 200
+        assert schema is not None, "Schema should be present in the span"
+        assert isinstance(schema, list), "Schema should be a list"
+        # An empty list must fail the test: skipping the checks below when the schema
+        # is empty would let the test pass without verifying anything
+        assert schema, "Schema should not be empty"
+
+        # Verify that the testcard field is present in the schema
+        assert "testcard" in schema[0], "testcard field should be in the schema"
+
+        # Check if the value was classified with custom tags
+        # Structure: schema[0]["testcard"] = [[[value_length, classification]], metadata]
+        # The guards below keep the original tolerance: when the field does not follow
+        # that layout, the classification checks are skipped rather than failed
+        field_data = schema[0]["testcard"]
+        if not isinstance(field_data, list) or len(field_data) == 0:
+            return
+        values = field_data[0]
+        if not isinstance(values, list) or len(values) == 0 or not isinstance(values[0], list):
+            return
+        if len(values[0]) <= 1:
+            return
+
+        classification = values[0][1]
+        assert isinstance(classification, dict), "Classification should be a dict"
+        assert "category" in classification, "Classification should include category"
+        assert classification["category"] == "testcategory", "Category should be testcategory"
+        assert "type" in classification, "Classification should include type"
+        assert classification["type"] == "card", "Type should be card"
+
+
+@rfc("https://docs.google.com/document/d/1wBrd-ShGoA9-aP96o0VIe46eBgw73GL1315R8QjuMoc/edit?tab=t.0")
+@scenarios.appsec_api_security_rc
+@features.api_security_configuration
+class Test_API_Security_Custom_Data_Classification_Multiple_Scanners(BaseAppsecApiSecurityRcTest):
+    """Test API Security - Multiple Custom Scanners"""
+
+    def setup_request_method(self):
+        """Run the shared scenario setup, then POST a body containing both a `mail` and a `testcard` field"""
+        self.setup_scenario()
+        self.request = weblog.post(
+            "/tag_value/api_rc_scanner/200", data={"mail": "systemtestmail@datadoghq.com", "testcard": "1234567890"}
+        )
+
+    def test_request_method(self):
+        """Verify the `req.bodytest` schema contains both the `mail` and the `testcard` fields"""
+        schema = get_schema(self.request, "req.bodytest")
+        assert self.request.status_code == 200
+        assert schema is not None, "Schema should be present in the span"
+        assert isinstance(schema, list), "Schema should be a list"
+        # An empty list must fail the test: skipping the checks below when the schema
+        # is empty would let the test pass without verifying anything
+        assert schema, "Schema should not be empty"
+
+        # Check for email detection by standard scanner
+        assert "mail" in schema[0], "mail field should be in the schema"
+        # Check for testcard detection by custom scanner
+        assert "testcard" in schema[0], "testcard field should be in the schema"
+
+
+@rfc("https://docs.google.com/document/d/1wBrd-ShGoA9-aP96o0VIe46eBgw73GL1315R8QjuMoc/edit?tab=t.0")
+@scenarios.appsec_api_security_rc
+@features.api_security_configuration
+class Test_API_Security_Custom_Data_Classification_Negative(BaseAppsecApiSecurityRcTest):
+    """Test API Security - Custom Data Classification Negative Cases"""
+
+    def setup_request_method(self):
+        """Run the shared scenario setup, then POST a body with a single non-sensitive field"""
+        self.setup_scenario()
+        self.request = weblog.post("/tag_value/api_rc_scanner/200", data={"normalfield": "normalvalue"})
+
+    def test_request_method(self):
+        """Check that a plain field is not tagged with one of the sensitive categories"""
+        schema = get_schema(self.request, "req.bodytest")
+        assert self.request.status_code == 200
+        assert schema is not None, "Schema should be present in the span"
+
+        # Nothing more to verify when the field does not appear in the schema
+        if len(schema) == 0 or "normalfield" not in schema[0]:
+            return
+
+        # Structure: field_data = [[[value_length, classification]], metadata]
+        # Any other layout carries no classification to inspect, so the test stops there
+        field_data = schema[0]["normalfield"]
+        if not isinstance(field_data, list) or len(field_data) == 0:
+            return
+        values = field_data[0]
+        if not isinstance(values, list) or len(values) == 0 or not isinstance(values[0], list):
+            return
+        if len(values[0]) <= 1:
+            return
+
+        classification = values[0][1]
+        if isinstance(classification, dict) and "category" in classification:
+            sensitive_categories = ["pii", "testcategory"]
+            assert (
+                classification["category"] not in sensitive_categories
+            ), "Normal fields should not be classified as sensitive"
diff --git a/tests/appsec/api_security/utils.py b/tests/appsec/api_security/utils.py
@@ -10,10 +10,16 @@ def setup_scenario(self) -> None:
             rc_state.set_config(
                 "datadog/2/ASM/ASM-base/config",
                 {
-                    "processor_override": [
-                        {"target": ["extract-content"], "scanners": ["test-scanner-002", "test-scanner-custom-001"]}
+                    "processor_overrides": [
+                        {
+                            "target": [{"id": "extract-content"}],
+                            "scanners": {
+                                "include": [{"id": "test-scanner-001"}, {"id": "test-scanner-custom-001"}],
+                                "exclude": [],
+                            },
+                        }
                     ],
-                    "custom_scanners": [
+                    "scanners": [
                         {
                             "id": "test-scanner-custom-001",
                             "name": "Custom scanner",