Signet Forge 0.1.1
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
data_classification.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3// See LICENSE_COMMERCIAL for full terms.
4#pragma once
5
6#if !defined(SIGNET_ENABLE_COMMERCIAL) || !SIGNET_ENABLE_COMMERCIAL
7#error "signet/ai/data_classification.hpp requires SIGNET_ENABLE_COMMERCIAL=ON (AGPL-3.0 commercial tier). See LICENSE_COMMERCIAL."
8#endif
9
10// ---------------------------------------------------------------------------
11// data_classification.hpp -- Formal Data Classification Ontology
12//
13// Gap G-9: Formal data classification ontology per DORA Art.8 + GDPR Art.32.
14//
15// Provides a structured, machine-readable data classification framework:
16// - DORA Art.8: ICT asset classification (data at rest, in transit)
17// - GDPR Art.9: Special categories of personal data
18// - GDPR Art.32: Appropriate security measures per classification
19// - NIST SP 800-60: Information types and security categorization
20//
21// Components:
22// - DataClassification: 4-tier confidentiality levels
23// - DataSensitivity: GDPR Art.9 special category types
24// - RegulatoryRegime: applicable regulatory frameworks
25// - DataClassificationRule: per-field classification + handling policy
26// - DataClassificationOntology: rule registry with validation
27//
28// Header-only. Part of the signet::forge AI module.
29// ---------------------------------------------------------------------------
30
31#include "signet/error.hpp"
32
33#include <algorithm>
34#include <stdexcept>
35#include <cstdint>
36#include <string>
37#include <unordered_map>
38#include <vector>
39
40namespace signet::forge {
41
42// ---------------------------------------------------------------------------
43// Enumerations
44// ---------------------------------------------------------------------------
45
/// Data confidentiality level per DORA Art.8 + ISO 27001 Annex A.
// NOTE(review): this listing skips source line 51. HIGHLY_RESTRICTED, which
// classification_name() switches on, is presumably declared there — confirm
// against the real header before relying on the enumerator list shown here.
47enum class DataClassification : int32_t {
48 PUBLIC = 0, ///< No confidentiality requirement.
49 INTERNAL = 1, ///< Business-internal, not for external sharing.
50 RESTRICTED = 2, ///< Regulated data (GDPR, FCA, MiFID II)
52};
53
/// Data sensitivity per GDPR Art.9 special categories.
// The numeric order is significant: validate_handling() tests
// `sensitivity >= DataSensitivity::PII`, so every enumerator with value 3 or
// above is treated as sensitive by that check.
55enum class DataSensitivity : int32_t {
56 NEUTRAL = 0, ///< No special sensitivity.
57 PSEUDONYMISED = 1, ///< Identifiable only with additional key (Art.25)
58 ANONYMISED = 2, ///< Irreversibly de-identified (Art.4(1))
59 PII = 3, ///< Personally Identifiable Information.
60 FINANCIAL_PII = 4, ///< Financial account data, trading activity.
61 BIOMETRIC = 5, ///< Biometric data (Art.9 special category)
62 HEALTH = 6 ///< Health/genetic data (Art.9 special category)
63};
64
/// Regulatory regime(s) applicable to the data.
66enum class RegulatoryRegime : int32_t {
67 NONE = 0,
68 GDPR = 1, ///< EU General Data Protection Regulation.
69 MIFID2 = 2, ///< Markets in Financial Instruments Directive II.
70 DORA = 3, ///< Digital Operational Resilience Act.
71 EU_AI_ACT = 4, ///< EU Artificial Intelligence Act.
72 SOX = 5, ///< Sarbanes-Oxley Act.
73 SEC_17A4 = 6, ///< SEC Rule 17a-4 (records retention)
74 PCI_DSS = 7, ///< Payment Card Industry Data Security Standard.
75 HIPAA = 8 ///< Health Insurance Portability and Accountability Act.
76};
77
78// ---------------------------------------------------------------------------
79// DataClassificationRule
80// ---------------------------------------------------------------------------
81
// Per-field data classification and handling policy (members of
// DataClassificationRule).
// NOTE(review): this listing omits source lines 82-84, 86-88 and 96, so the
// struct header and some members are not shown. Other code in this file reads
// rule.classification and rule.sensitivity and writes r.allow_pseudonymisation,
// so those members presumably live on the omitted lines — confirm against the
// real header.
85 std::string field_name; ///< Column/field path (e.g., "user.email", "price")
89
90 // --- Retention lifecycle ---
91 int64_t min_retention_ns = 0; ///< Minimum retention (0 = no min)
92 int64_t max_retention_ns = INT64_C(157788000000000000); ///< Max retention (default 5y); 5 x 365.25 days expressed in nanoseconds
93
94 // --- Processing restrictions ---
95 bool require_encryption = false; ///< RESTRICTED/HIGHLY_RESTRICTED → true.
97 bool allow_aggregation = true;
98 bool allow_ml_training = true; ///< PII, secrets → false.
99 bool allow_export = true; ///< HIGHLY_RESTRICTED → false.
100 bool allow_logging = true; ///< Biometric, health → false in plaintext.
101
102 // --- Purpose limitation (GDPR Art.5(1)(b)) ---
103 std::vector<std::string> allowed_purposes; // empty list = validate_handling() applies no purpose restriction
104};
105
106// ---------------------------------------------------------------------------
107// DataClassificationOntology
108// ---------------------------------------------------------------------------
109
115public:
/// Construct an ontology with the given identifier.
///
/// @param ontology_id  Identifier stored in ontology_id_ and returned by ontology_id().
/// @throws std::runtime_error  with the gate's error message when
///         commercial::require_feature("DataClassificationOntology") does not succeed.
117 explicit DataClassificationOntology(const std::string& ontology_id = "default")
118 : ontology_id_(ontology_id) {
119 auto gate = commercial::require_feature("DataClassificationOntology");
120 if (!gate) throw std::runtime_error(gate.error().message); // construction fails outright when the feature gate reports an error
121 }
122
// Body of add_rule(): the rule is stored under its own field_name. Assigning
// through operator[] replaces any rule already registered for that field.
// NOTE(review): the signature line (source line 124) is missing from this listing.
125 rules_[rule.field_name] = rule;
126 }
127
/// Look up the classification rule for a field.
///
/// @param field_name  Key to search for in the rule registry.
/// @return A copy of the registered rule, or a fallback rule carrying
///         @p field_name when no rule is registered.
130 [[nodiscard]] DataClassificationRule lookup(const std::string& field_name) const {
131 auto it = rules_.find(field_name);
132 if (it != rules_.end()) return it->second;
133 // Unknown fields fall back to a default rule, described by the original
134 // author as PUBLIC/NEUTRAL (least restrictive). Callers that need
135 // fail-closed semantics should register every field explicitly.
// NOTE(review): validate_handling() takes no require_encryption argument; it
// reads rule.require_encryption from the rule returned here, and that member
// defaults to false. The lines that declare `dflt` and set its remaining
// members (source lines 136, 138-139) are missing from this listing, so the
// exact fallback values cannot be confirmed from here.
137 dflt.field_name = field_name;
140 return dflt;
141 }
142
/// Get all registered rules.
///
/// @return A vector holding a copy of every rule in the registry. The order
///         follows iteration over the underlying std::unordered_map and is
///         therefore unspecified.
144 [[nodiscard]] std::vector<DataClassificationRule> all_rules() const {
145 std::vector<DataClassificationRule> out;
146 out.reserve(rules_.size()); // size is known up front, so reserve once
147 for (const auto& [_, r] : rules_) out.push_back(r); // keys are ignored; only the rule values are copied
148 return out;
149 }
150
/// Number of registered rules (one entry per distinct field_name key).
152 [[nodiscard]] size_t size() const { return rules_.size(); }
153
/// Ontology identifier, returned by reference to the member set at construction.
155 [[nodiscard]] const std::string& ontology_id() const { return ontology_id_; }
156
// Validate that a field's actual handling meets classification requirements.
//
// Parameters:
//   field_name         - field whose rule is fetched via lookup().
//   is_encrypted       - caller's statement that the data is encrypted.
//   is_pseudonymised   - caller's statement that the data is pseudonymised.
//   purpose_is_allowed - caller's verdict on whether the processing purpose is
//                        permitted; this function does not search
//                        allowed_purposes itself.
// Returns an empty (success) result when no check fires.
// NOTE(review): the line carrying the function name and return type (source
// line 161) and the first line of each error return (source lines 171, 180,
// 187) are missing from this listing; only the message text of each error is
// visible below.
162 const std::string& field_name,
163 bool is_encrypted,
164 bool is_pseudonymised,
165 bool purpose_is_allowed = true) const
166 {
167 auto rule = lookup(field_name);
168
169 // Fields whose rule sets require_encryption must be encrypted. The check keys on that flag, not directly on the classification tier; the tier only appears in the message.
170 if (rule.require_encryption && !is_encrypted) {
172 "Data classification violation: field '" + field_name +
173 "' requires encryption (classification=" +
174 classification_name(rule.classification) + ")"};
175 }
176
177 // Sensitive data (PII or higher in enum order) must be pseudonymised when the rule forbids logging; all three conditions must hold for the check to fire.
178 if (rule.sensitivity >= DataSensitivity::PII &&
179 !is_pseudonymised && !rule.allow_logging) {
181 "Data classification violation: field '" + field_name +
182 "' contains sensitive data and must be pseudonymised for logging"};
183 }
184
185 // Purpose limitation check: enforced only when the rule lists at least one allowed purpose; an empty list means no restriction.
186 if (!purpose_is_allowed && !rule.allowed_purposes.empty()) {
188 "Data classification violation: field '" + field_name +
189 "' processing purpose not in allowed list (GDPR Art.5(1)(b))"};
190 }
191
192 return {}; // no violation found
193 }
194
// Build a default ontology with standard financial/compliance field rules.
// The ontology id is "financial-default". Rules are registered for the fields
// "price", "volume", "strategy_id", "trader_id" and "encryption_key"; a
// "timestamp" rule is started on source line 202.
// NOTE(review): this listing omits many source lines of this function: the
// signature (196), the remainder of the "timestamp" rule, and in each braced
// block the declaration of `r` plus its classification/sensitivity/regime
// assignments. Only the visible assignments are described here.
197 DataClassificationOntology ont("financial-default");
198
199 // Public data
202 ont.add_rule({"timestamp", DataClassification::PUBLIC,
204
205 // Internal market data
206 {
208 r.field_name = "price";
212 r.min_retention_ns = INT64_C(157788000000000000); // 5y MiFID II (5 x 365.25 days in nanoseconds)
213 ont.add_rule(r);
214 }
215 {
217 r.field_name = "volume";
221 r.min_retention_ns = INT64_C(157788000000000000); // same 5-year value as "price"
222 ont.add_rule(r);
223 }
224
225 // Restricted trading data
226 {
228 r.field_name = "strategy_id";
232 r.require_encryption = true;
233 r.allow_ml_training = false;
234 r.min_retention_ns = INT64_C(157788000000000000); // same 5-year value as "price"
235 ont.add_rule(r);
236 }
237
238 // Highly restricted PII
239 {
241 r.field_name = "trader_id";
245 r.require_encryption = true;
246 r.allow_ml_training = false;
247 r.allow_export = false;
248 r.allow_logging = false;
249 r.allowed_purposes = {"compliance-reporting", "regulatory-inquiry"}; // non-empty list: validate_handling() rejects this field when purpose_is_allowed is false
250 ont.add_rule(r);
251 }
252
253 // Cryptographic key material
254 {
256 r.field_name = "encryption_key";
260 r.require_encryption = true;
261 r.allow_pseudonymisation = false;
262 r.allow_aggregation = false;
263 r.allow_ml_training = false;
264 r.allow_export = false;
265 r.allow_logging = false;
266 ont.add_rule(r);
267 }
268
269 return ont;
270 }
271
272private:
// Map a DataClassification value to its enumerator name as a string.
// Used by validate_handling() to build the encryption-violation message.
// Returns "UNKNOWN" for any value not matched by one of the four cases.
273 static std::string classification_name(DataClassification c) {
274 switch (c) {
275 case DataClassification::PUBLIC: return "PUBLIC";
276 case DataClassification::INTERNAL: return "INTERNAL";
277 case DataClassification::RESTRICTED: return "RESTRICTED";
278 case DataClassification::HIGHLY_RESTRICTED: return "HIGHLY_RESTRICTED";
279 }
280 return "UNKNOWN"; // reached only when c holds a value outside the listed enumerators
281 }
282
283 std::string ontology_id_;
284 std::unordered_map<std::string, DataClassificationRule> rules_;
285};
286
287} // namespace signet::forge
A named collection of data classification rules forming a formal ontology.
const std::string & ontology_id() const
Ontology identifier.
DataClassificationRule lookup(const std::string &field_name) const
Look up the classification rule for a field.
DataClassificationOntology(const std::string &ontology_id="default")
Construct an ontology with the given identifier.
expected< void > validate_handling(const std::string &field_name, bool is_encrypted, bool is_pseudonymised, bool purpose_is_allowed=true) const
Validate that a field's actual handling meets classification requirements.
std::vector< DataClassificationRule > all_rules() const
Get all registered rules.
static DataClassificationOntology financial_default()
Build a default ontology with standard financial/compliance field rules.
size_t size() const
Number of registered rules.
void add_rule(const DataClassificationRule &rule)
Add a classification rule for a field.
A lightweight result type that holds either a success value of type T or an Error.
Definition error.hpp:143
RegulatoryRegime
Regulatory regime(s) applicable to the data.
@ SEC_17A4
SEC Rule 17a-4 (records retention)
@ PCI_DSS
Payment Card Industry Data Security Standard.
@ GDPR
EU General Data Protection Regulation.
@ DORA
Digital Operational Resilience Act.
@ MIFID2
Markets in Financial Instruments Directive II.
@ EU_AI_ACT
EU Artificial Intelligence Act.
@ SOX
Sarbanes-Oxley Act.
@ HIPAA
Health Insurance Portability and Accountability Act.
DataClassification
Data confidentiality level per DORA Art.8 + ISO 27001 Annex A.
@ RESTRICTED
Regulated data (GDPR, FCA, MiFID II)
@ INTERNAL
Business-internal, not for external sharing.
@ HIGHLY_RESTRICTED
Cryptographic keys, trading secrets, PII.
@ PUBLIC
No confidentiality requirement.
DataSensitivity
Data sensitivity per GDPR Art.9 special categories.
@ BIOMETRIC
Biometric data (Art.9 special category)
@ ANONYMISED
Irreversibly de-identified (Art.4(1))
@ NEUTRAL
No special sensitivity.
@ FINANCIAL_PII
Financial account data, trading activity.
@ HEALTH
Health/genetic data (Art.9 special category)
@ PII
Personally Identifiable Information.
@ PSEUDONYMISED
Identifiable only with additional key (Art.25)
@ INVALID_ARGUMENT
A caller-supplied argument is outside the valid range or violates a precondition.
Per-field data classification and handling policy.
bool allow_logging
Biometric, health → false in plaintext.
bool require_encryption
RESTRICTED/HIGHLY_RESTRICTED → true.
int64_t max_retention_ns
Max retention (default 5y)
bool allow_export
HIGHLY_RESTRICTED → false.
std::string field_name
Column/field path (e.g., "user.email", "price")
int64_t min_retention_ns
Minimum retention (0 = no min)
bool allow_ml_training
PII, secrets → false.
Lightweight error value carrying an ErrorCode and a human-readable message.
Definition error.hpp:99