Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
key_metadata.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3#pragma once
4
8
9// ---------------------------------------------------------------------------
10// key_metadata.hpp -- Key material and encryption metadata for Parquet
11// Modular Encryption (PME)
12//
13// Defines the structures needed to configure, store, and retrieve encryption
14// key material for Parquet files. Supports two key modes:
15//
16// INTERNAL -- Key material is stored directly in file metadata. Suitable
17// for testing, development, and self-contained encrypted files.
18//
19// EXTERNAL -- Only a KMS key identifier is stored; the actual key material
20// is retrieved from an external Key Management Service at
21// read time.
22//
23// Serialization uses a simple binary TLV (tag-length-value) format:
24// [4-byte LE tag] [4-byte LE length] [data bytes]
25// for each field. This is intentionally simple and deterministic -- no
26// alignment padding, no variable-length integers.
27//
28// Tags used:
29// 0x0001 key_mode (4 bytes, int32_t LE)
30// 0x0002 key_material (variable length blob)
31// 0x0003 key_id (variable length UTF-8 string)
32// 0x0010 algorithm (4 bytes, int32_t LE)
33// 0x0011 footer_encrypted (1 byte, 0/1)
34// 0x0012 aad_prefix (variable length UTF-8 string)
35// ---------------------------------------------------------------------------
36
37#include "signet/error.hpp"
39
40#include <cstddef>
41#include <cstdint>
42#include <cstdio>
43#include <cstring>
44#include <memory>
45#include <stdexcept>
46#include <string>
47#include <unordered_map>
48#include <unordered_set>
49#include <vector>
50
51namespace signet::forge::crypto {
52
53// ===========================================================================
54// Enumerations
55// ===========================================================================
56
/// How the encryption key is stored or referenced.
///
/// The numeric values are written to the wire as an int32 (TLV tag 0x0001),
/// so they must stay stable.
58enum class KeyMode : int32_t {
59 INTERNAL = 0, ///< Key material stored directly in file metadata (testing/dev).
60 EXTERNAL = 1 ///< Key referenced by KMS key ID; actual key resolved from KMS at runtime.
61};
62
63// ===========================================================================
64// IKmsClient -- Abstract KMS client interface (Gap P-5)
65//
66// Implements the two-tier key hierarchy defined by the Parquet Modular
67// Encryption specification (PARQUET-1178 §3):
68//
69// KEK (Key Encryption Key) -- Master key held in the KMS (AWS KMS,
70// Azure Key Vault, HashiCorp Vault, GCP Cloud KMS, etc.).
71// Never leaves the KMS boundary.
72//
73// DEK (Data Encryption Key) -- Per-file or per-column AES-256 key.
74// Generated locally, wrapped (encrypted) by the KEK, and stored
75// in the Parquet file metadata as an opaque blob.
76//
77// On write, the caller generates a random DEK and calls wrap_key()
78// to encrypt it under a KEK identified by key_id. The wrapped DEK
79// is stored alongside the key_id in the file metadata.
80//
81// On read, the caller retrieves the wrapped DEK from file metadata
82// and calls unwrap_key() with the same key_id to recover the DEK.
83//
84// Implementations should be thread-safe (concurrent wrap/unwrap
85// from multiple FileEncryptor/FileDecryptor instances).
86//
87// References:
88// - Apache Parquet Modular Encryption (PARQUET-1178) §3
89// - NIST SP 800-57 Part 1 Rev. 5 §5.3 (key hierarchy)
90// - NIST SP 800-38F (AES Key Wrap)
91// ===========================================================================
92
98public:
99 virtual ~IKmsClient() = default;
100
107 const std::vector<uint8_t>& dek,
108 const std::string& master_key_id) const = 0;
109
116 const std::vector<uint8_t>& wrapped_dek,
117 const std::string& master_key_id) const = 0;
118};
119
/// Encryption algorithm identifier.
///
/// The numeric values are serialized as an int32 in the TLV format
/// (tag 0x0010), so they must stay stable.
128enum class EncryptionAlgorithm : int32_t {
129 AES_GCM_V1 = 0, ///< AES-256-GCM for both footer and column data.
130 AES_GCM_CTR_V1 = 1 ///< AES-256-GCM for footer, AES-256-CTR for column data (Parquet default).
131};
132
133// ===========================================================================
134// ColumnKeySpec -- Per-column encryption key specification
135// ===========================================================================
136
143 std::string column_name;
144 std::vector<uint8_t> key;
145 std::string key_id;
146};
147
148// ===========================================================================
149// EncryptionConfig -- Complete encryption configuration for a Parquet file
150// ===========================================================================
151
160
161 // --- Footer encryption ---------------------------------------------------
162
164 std::vector<uint8_t> footer_key;
165
167 std::string footer_key_id;
168
171 bool encrypt_footer = true;
172
173 // --- Column encryption ---------------------------------------------------
174
176 std::vector<ColumnKeySpec> column_keys;
177
181 std::vector<uint8_t> default_column_key;
182
185
186 // --- Key mode ------------------------------------------------------------
187
190
191 // --- KMS client (Gap P-5) ------------------------------------------------
192
201 std::shared_ptr<IKmsClient> kms_client;
202
203 // --- AAD (Additional Authenticated Data) ---------------------------------
204
208 std::string aad_prefix;
209
210 // --- AAD format (Gap P-4) ------------------------------------------------
211
226 enum class AadFormat : int32_t {
227 LEGACY = 0,
228 SPEC_BINARY = 1,
229 };
231};
232
233// ===========================================================================
234// Serialization helpers (TLV: tag-length-value, all little-endian)
235// ===========================================================================
namespace detail::meta {

/// Write a 4-byte little-endian uint32 to @p dst.
/// @param dst Destination; the caller must provide at least 4 writable bytes.
/// @param val Value to encode, least-significant byte first.
inline void write_le32(uint8_t* dst, uint32_t val) {
    dst[0] = static_cast<uint8_t>(val);
    dst[1] = static_cast<uint8_t>(val >> 8);
    dst[2] = static_cast<uint8_t>(val >> 16);
    dst[3] = static_cast<uint8_t>(val >> 24);
}

/// Read a 4-byte little-endian uint32 from @p src.
/// @param src Source; the caller must provide at least 4 readable bytes.
/// @return The decoded value.
inline uint32_t read_le32(const uint8_t* src) {
    return static_cast<uint32_t>(src[0])
         | (static_cast<uint32_t>(src[1]) << 8)
         | (static_cast<uint32_t>(src[2]) << 16)
         | (static_cast<uint32_t>(src[3]) << 24);
}

/// Maximum TLV field length (64 MB cap to prevent memory exhaustion from
/// malformed data).
inline constexpr uint32_t MAX_TLV_LENGTH = 64u * 1024u * 1024u;

/// Maximum total metadata size (1 MB cap to prevent memory exhaustion from
/// crafted payloads).
inline constexpr size_t MAX_METADATA_SIZE = 1024 * 1024;

/// Size in bytes of the fixed TLV header: 4-byte tag followed by 4-byte length.
inline constexpr size_t TLV_HEADER_SIZE = 8;

/// Append a TLV field: [4-byte LE tag] [4-byte LE length] [data].
/// @param buf  Buffer the field is appended to.
/// @param tag  Field tag.
/// @param data Payload bytes; only read when @p len is non-zero.
/// @param len  Payload length in bytes.
inline void append_tlv(std::vector<uint8_t>& buf,
                       uint32_t tag,
                       const uint8_t* data, uint32_t len) {
    const size_t pos = buf.size();
    buf.resize(pos + TLV_HEADER_SIZE + len);
    write_le32(buf.data() + pos, tag);
    write_le32(buf.data() + pos + 4, len);
    if (len > 0) {
        std::memcpy(buf.data() + pos + TLV_HEADER_SIZE, data, len);
    }
}

/// Append a TLV field containing a single int32_t (little-endian).
inline void append_tlv_i32(std::vector<uint8_t>& buf,
                           uint32_t tag, int32_t val) {
    uint8_t tmp[4];
    write_le32(tmp, static_cast<uint32_t>(val));
    append_tlv(buf, tag, tmp, 4);
}

/// Append a TLV field containing a single byte.
inline void append_tlv_u8(std::vector<uint8_t>& buf,
                          uint32_t tag, uint8_t val) {
    append_tlv(buf, tag, &val, 1);
}

/// Append a TLV field containing a string.
/// @throws std::overflow_error if the string is longer than MAX_TLV_LENGTH.
inline void append_tlv_str(std::vector<uint8_t>& buf,
                           uint32_t tag, const std::string& s) {
    // Checked before the narrowing cast so the 32-bit length cannot wrap.
    if (s.size() > MAX_TLV_LENGTH) {
        throw std::overflow_error("TLV value exceeds maximum length");
    }
    append_tlv(buf, tag,
               reinterpret_cast<const uint8_t*>(s.data()),
               static_cast<uint32_t>(s.size()));
}

/// Append a TLV field containing a blob.
/// @throws std::overflow_error if the blob is longer than MAX_TLV_LENGTH.
inline void append_tlv_blob(std::vector<uint8_t>& buf,
                            uint32_t tag,
                            const std::vector<uint8_t>& blob) {
    // Checked before the narrowing cast so the 32-bit length cannot wrap.
    if (blob.size() > MAX_TLV_LENGTH) {
        throw std::overflow_error("TLV value exceeds maximum length");
    }
    append_tlv(buf, tag, blob.data(), static_cast<uint32_t>(blob.size()));
}

/// A parsed TLV field. `data` is a non-owning view into the buffer that was
/// handed to read_tlv(), so it is only valid while that buffer is alive.
struct TlvField {
    uint32_t tag = 0;               ///< Field tag.
    const uint8_t* data = nullptr;  ///< First payload byte (not owned).
    uint32_t length = 0;            ///< Payload length in bytes.
};

/// Parse the next TLV field from a buffer.
///
/// @param buf      Start of the encoded buffer.
/// @param buf_size Total size of @p buf in bytes.
/// @param offset   In: position of the field header. Out: position just past
///                 the field. Left untouched on failure.
/// @param field    Receives the parsed field. Left untouched on failure.
/// @return true if a complete field was parsed; false if the header or the
///         payload does not fit in the buffer, or the declared length exceeds
///         MAX_TLV_LENGTH.
inline bool read_tlv(const uint8_t* buf, size_t buf_size,
                     size_t& offset, TlvField& field) {
    if (buf == nullptr) return false;

    // Overflow-safe header check: compare by subtraction so that an offset
    // close to SIZE_MAX cannot wrap `offset + 8` into a small, in-range value.
    if (offset > buf_size || buf_size - offset < TLV_HEADER_SIZE) return false;

    const uint8_t* header = buf + offset;
    const uint32_t tag = read_le32(header);
    const uint32_t length = read_le32(header + 4);

    // Guard: reject oversized TLV fields.
    if (length > MAX_TLV_LENGTH) return false;

    // Cannot underflow: the header check above guarantees at least 8 bytes.
    const size_t remaining = buf_size - offset - TLV_HEADER_SIZE;
    if (length > remaining) return false;

    // Commit only after every check has passed.
    field.tag = tag;
    field.data = header + TLV_HEADER_SIZE;
    field.length = length;
    offset += TLV_HEADER_SIZE + length;
    return true;
}

/// Read an int32 from a TLV field's data (must be exactly 4 bytes).
/// @return false if the field is not exactly 4 bytes long or has no data.
inline bool tlv_to_i32(const TlvField& field, int32_t& out) {
    if (field.length != 4 || field.data == nullptr) return false;
    out = static_cast<int32_t>(read_le32(field.data));
    return true;
}

/// Read a uint8 from a TLV field's data (must be exactly 1 byte).
/// @return false if the field is not exactly 1 byte long or has no data.
inline bool tlv_to_u8(const TlvField& field, uint8_t& out) {
    if (field.length != 1 || field.data == nullptr) return false;
    out = field.data[0];
    return true;
}

/// Read a string from a TLV field's data (copies the payload).
inline std::string tlv_to_str(const TlvField& field) {
    if (field.length == 0 || field.data == nullptr) return {};
    return std::string(reinterpret_cast<const char*>(field.data), field.length);
}

/// Read a blob from a TLV field's data (copies the payload).
inline std::vector<uint8_t> tlv_to_blob(const TlvField& field) {
    if (field.length == 0 || field.data == nullptr) return {};
    return std::vector<uint8_t>(field.data, field.data + field.length);
}

/// Tag: key mode (4 bytes, int32_t LE).
inline constexpr uint32_t TAG_KEY_MODE = 0x0001;
/// Tag: raw key material (variable blob).
inline constexpr uint32_t TAG_KEY_MATERIAL = 0x0002;
/// Tag: KMS key identifier (variable UTF-8).
inline constexpr uint32_t TAG_KEY_ID = 0x0003;
/// Tag: encryption algorithm (4 bytes, int32_t LE).
inline constexpr uint32_t TAG_ALGORITHM = 0x0010;
/// Tag: footer-encrypted flag (1 byte, 0/1).
inline constexpr uint32_t TAG_FOOTER_ENCRYPTED = 0x0011;
/// Tag: AAD prefix string (variable UTF-8).
inline constexpr uint32_t TAG_AAD_PREFIX = 0x0012;

} // namespace detail::meta
373
374// ===========================================================================
375// EncryptionKeyMetadata -- Key material stored in file/column metadata
376// ===========================================================================
377
387 std::vector<uint8_t> key_material;
388 std::string key_id;
389
392 [[nodiscard]] std::vector<uint8_t> serialize() const {
393 using namespace detail::meta;
394
395 std::vector<uint8_t> buf;
396 buf.reserve(64);
397
398 // Key mode (always present)
399 append_tlv_i32(buf, TAG_KEY_MODE, static_cast<int32_t>(key_mode));
400
401 // Key material (INTERNAL) or key ID (EXTERNAL)
402 if (key_mode == KeyMode::INTERNAL && !key_material.empty()) {
403#ifdef SIGNET_PRODUCTION_MODE
404 // CR-1 / FIPS 140-3 §7.7: INTERNAL key mode is forbidden in production
405 // builds — raw AES key material must never be serialized into metadata.
406 throw std::runtime_error(
407 "KeyMode::INTERNAL is disabled in production builds "
408 "(SIGNET_PRODUCTION_MODE). Use EXTERNAL key management.");
409#endif
410#ifndef SIGNET_SUPPRESS_INTERNAL_KEY_WARNING
411 static bool warned = false;
412 if (!warned) {
413 fprintf(stderr, "[SIGNET WARNING] KeyMode::INTERNAL stores encryption key in file metadata — NOT for production use\n");
414 warned = true;
415 }
416#endif
417 append_tlv_blob(buf, TAG_KEY_MATERIAL, key_material);
418 }
419 if (!key_id.empty()) {
420 append_tlv_str(buf, TAG_KEY_ID, key_id);
421 }
422
423 return buf;
424 }
425
431 const uint8_t* data, size_t size) {
432
433 using namespace detail::meta;
434
435 if (size > MAX_METADATA_SIZE) {
437 "key metadata exceeds 1 MB limit (CWE-770)"};
438 }
439
441 size_t offset = 0;
442 bool found_mode = false;
443
444 while (offset < size) {
445 TlvField field;
446 if (!read_tlv(data, size, offset, field)) {
448 "EncryptionKeyMetadata: truncated TLV field"};
449 }
450
451 switch (field.tag) {
452 case TAG_KEY_MODE: {
453 int32_t mode_val;
454 if (!tlv_to_i32(field, mode_val)) {
456 "EncryptionKeyMetadata: invalid key_mode field"};
457 }
458 if (mode_val < 0 || mode_val > 1) {
460 "EncryptionKeyMetadata: invalid KeyMode value"};
461 }
462 meta.key_mode = static_cast<KeyMode>(mode_val);
463 found_mode = true;
464 break;
465 }
466 case TAG_KEY_MATERIAL:
467 meta.key_material = tlv_to_blob(field);
468 break;
469 case TAG_KEY_ID:
470 meta.key_id = tlv_to_str(field);
471 break;
472 default:
473 // Unknown tags are silently skipped for forward compatibility
474 break;
475 }
476 }
477
478 if (!found_mode) {
480 "EncryptionKeyMetadata: missing key_mode field"};
481 }
482
483 return meta;
484 }
485};
486
487// ===========================================================================
488// FileEncryptionProperties -- File-level encryption metadata
489// ===========================================================================
490
497 bool footer_encrypted = true;
498 std::string aad_prefix;
499
502 [[nodiscard]] std::vector<uint8_t> serialize() const {
503 using namespace detail::meta;
504
505 std::vector<uint8_t> buf;
506 buf.reserve(64);
507
508 append_tlv_i32(buf, TAG_ALGORITHM, static_cast<int32_t>(algorithm));
509 append_tlv_u8(buf, TAG_FOOTER_ENCRYPTED, footer_encrypted ? 1 : 0);
510 if (!aad_prefix.empty()) {
511 append_tlv_str(buf, TAG_AAD_PREFIX, aad_prefix);
512 }
513
514 return buf;
515 }
516
522 const uint8_t* data, size_t size) {
523
524 using namespace detail::meta;
525
526 if (size > MAX_METADATA_SIZE) {
528 "key metadata exceeds 1 MB limit (CWE-770)"};
529 }
530
532 size_t offset = 0;
533 bool found_algo = false;
534
535 while (offset < size) {
536 TlvField field;
537 if (!read_tlv(data, size, offset, field)) {
539 "FileEncryptionProperties: truncated TLV field"};
540 }
541
542 switch (field.tag) {
543 case TAG_ALGORITHM: {
544 int32_t algo_val;
545 if (!tlv_to_i32(field, algo_val)) {
547 "FileEncryptionProperties: invalid algorithm field"};
548 }
549 if (algo_val < 0 || algo_val > 1) {
551 "FileEncryptionProperties: invalid EncryptionAlgorithm value"};
552 }
553 props.algorithm = static_cast<EncryptionAlgorithm>(algo_val);
554 found_algo = true;
555 break;
556 }
557 case TAG_FOOTER_ENCRYPTED: {
558 uint8_t val;
559 if (!tlv_to_u8(field, val)) {
561 "FileEncryptionProperties: invalid footer_encrypted field"};
562 }
563 props.footer_encrypted = (val != 0);
564 break;
565 }
566 case TAG_AAD_PREFIX:
567 props.aad_prefix = tlv_to_str(field);
568 break;
569 default:
570 // Unknown tags silently skipped for forward compatibility
571 break;
572 }
573 }
574
575 if (!found_algo) {
577 "FileEncryptionProperties: missing algorithm field"};
578 }
579
580 return props;
581 }
582};
583
584// ===========================================================================
585// Thrift-based serialization (Gap P-6)
586//
587// The Parquet PME spec uses Thrift Compact Protocol for key metadata,
588// specifically the ColumnCryptoMetaData struct. Adding Thrift serialization
589// alongside the existing TLV format enables cross-implementation interop
590// with parquet-mr (Java), pyarrow, and other Parquet implementations.
591//
592// Parquet Thrift schema (from parquet.thrift):
593//
594// struct AesGcmV1 {
595// 1: optional binary aad_prefix
596// 2: optional binary aad_file_unique
597// 3: optional bool supply_aad_prefix
598// }
599// struct AesGcmCtrV1 {
600// 1: optional binary aad_prefix
601// 2: optional binary aad_file_unique
602// 3: optional bool supply_aad_prefix
603// }
604// union EncryptionAlgorithm {
605// 1: AesGcmV1 AES_GCM_V1
606// 2: AesGcmCtrV1 AES_GCM_CTR_V1
607// }
608// struct ColumnCryptoMetaData {
609// 1: required EncryptionAlgorithm ENCRYPTION_WITH_FOOTER_KEY
610// 2: optional binary key_metadata
611// }
612// struct FileCryptoMetaData {
613// 1: required EncryptionAlgorithm encryption_algorithm
614// 2: optional binary key_metadata
615// }
616//
617// Reference: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
618// ===========================================================================
619
620namespace detail::thrift_crypto {
621
/// Serialize EncryptionKeyMetadata to Thrift Compact Protocol.
///
/// Emits field 1 (the EncryptionAlgorithm union, optionally carrying
/// aad_prefix) followed by field 2 (an opaque key_metadata blob), then the
/// struct stop marker.
///
/// Blob layout, as built below:
///   [1 byte key_mode]
///   [0x01][2-byte LE length][key material]   -- INTERNAL mode, non-empty only
///   [0x02][2-byte LE length][key_id]         -- when key_id is non-empty
///
/// NOTE(review): `enc` and `algo` are used below but their declarations, and
/// the field-header writes, are not visible in this listing.
630inline std::vector<uint8_t> serialize_key_metadata(
631 const EncryptionKeyMetadata& meta,
633 const std::string& aad_prefix = "") {
634
636
637 // Field 1: EncryptionAlgorithm (union, encoded as struct)
639 enc.begin_struct();
640 {
641 // Union: field 1 = AES_GCM_V1, field 2 = AES_GCM_CTR_V1
642 int16_t union_field = (algo == EncryptionAlgorithm::AES_GCM_V1) ? 1 : 2;
644 enc.begin_struct();
645 {
646 // AesGcmV1 / AesGcmCtrV1 inner struct
647 if (!aad_prefix.empty()) {
649 enc.write_binary(
650 reinterpret_cast<const uint8_t*>(aad_prefix.data()),
651 aad_prefix.size());
652 }
// Terminates the inner AES struct.
653 enc.write_stop();
654 }
655 enc.end_struct();
// Terminates the union struct.
656 enc.write_stop();
657 }
658 enc.end_struct();
659
660 // Field 2: key_metadata (opaque binary — contains serialized key info)
661 // We embed a mini TLV inside the binary blob for key_mode + key_material/key_id
662 // This is what parquet-mr and pyarrow do — the key_metadata field is opaque
663 {
664 std::vector<uint8_t> key_blob;
665 // Encode key_mode as first byte
666 key_blob.push_back(static_cast<uint8_t>(meta.key_mode));
667 if (meta.key_mode == KeyMode::INTERNAL && !meta.key_material.empty()) {
668 key_blob.push_back(0x01); // marker: key material follows
669 // 2-byte LE length + data
// NOTE(review): the uint16_t cast silently truncates sizes above 65535,
// while the insert below still appends every byte, so the length prefix
// and the payload would disagree. Confirm callers enforce a size limit.
670 uint16_t klen = static_cast<uint16_t>(meta.key_material.size());
671 key_blob.push_back(static_cast<uint8_t>(klen & 0xFF));
672 key_blob.push_back(static_cast<uint8_t>((klen >> 8) & 0xFF));
673 key_blob.insert(key_blob.end(),
674 meta.key_material.begin(), meta.key_material.end());
675 }
676 if (!meta.key_id.empty()) {
677 key_blob.push_back(0x02); // marker: key_id follows
// NOTE(review): same uint16_t truncation concern as for key material —
// a key_id longer than 65535 bytes would get a wrapped length prefix.
678 uint16_t idlen = static_cast<uint16_t>(meta.key_id.size());
679 key_blob.push_back(static_cast<uint8_t>(idlen & 0xFF));
680 key_blob.push_back(static_cast<uint8_t>((idlen >> 8) & 0xFF));
681 key_blob.insert(key_blob.end(), meta.key_id.begin(), meta.key_id.end());
682 }
683
685 enc.write_binary(key_blob.data(), key_blob.size());
686 }
687
// Terminates the outer struct.
688 enc.write_stop();
689 return enc.data();
690}
691
694 const uint8_t* data, size_t size) {
695
698 "Thrift key metadata exceeds 1 MB limit"};
699 }
700
701 thrift::CompactDecoder dec(data, size);
703 bool found_key_metadata = false;
704
705 while (true) {
706 auto fh = dec.read_field_header();
707 if (fh.is_stop()) break;
708
709 switch (fh.field_id) {
710 case 1: {
711 // EncryptionAlgorithm union — skip it (we don't need it for key metadata)
712 dec.skip_field(fh.thrift_type);
713 break;
714 }
715 case 2: {
716 // key_metadata binary blob
717 auto blob = dec.read_binary();
718 if (blob.empty()) break;
719 found_key_metadata = true;
720
721 size_t off = 0;
722 if (off < blob.size()) {
723 meta.key_mode = static_cast<KeyMode>(blob[off++]);
724 }
725 while (off < blob.size()) {
726 uint8_t marker = blob[off++];
727 if (marker == 0x01 && off + 2 <= blob.size()) {
728 // key material
729 uint16_t klen = static_cast<uint16_t>(blob[off])
730 | (static_cast<uint16_t>(blob[off + 1]) << 8);
731 off += 2;
732 if (off + klen <= blob.size()) {
733 meta.key_material.assign(blob.begin() + static_cast<ptrdiff_t>(off),
734 blob.begin() + static_cast<ptrdiff_t>(off + klen));
735 off += klen;
736 }
737 } else if (marker == 0x02 && off + 2 <= blob.size()) {
738 // key_id
739 uint16_t idlen = static_cast<uint16_t>(blob[off])
740 | (static_cast<uint16_t>(blob[off + 1]) << 8);
741 off += 2;
742 if (off + idlen <= blob.size()) {
743 meta.key_id = std::string(
744 reinterpret_cast<const char*>(blob.data() + off), idlen);
745 off += idlen;
746 }
747 } else {
748 break; // unknown marker, stop
749 }
750 }
751 break;
752 }
753 default:
754 dec.skip_field(fh.thrift_type);
755 break;
756 }
757 }
758
759 if (!found_key_metadata) {
761 "Thrift ColumnCryptoMetaData: missing key_metadata field"};
762 }
763
764 return meta;
765}
766
772inline std::vector<uint8_t> serialize_file_properties(
773 const FileEncryptionProperties& props) {
774
776
777 // Field 1: EncryptionAlgorithm union
779 enc.begin_struct();
780 {
781 int16_t union_field = (props.algorithm == EncryptionAlgorithm::AES_GCM_V1) ? 1 : 2;
783 enc.begin_struct();
784 {
785 if (!props.aad_prefix.empty()) {
787 enc.write_binary(
788 reinterpret_cast<const uint8_t*>(props.aad_prefix.data()),
789 props.aad_prefix.size());
790 }
791 // field 3: supply_aad_prefix — true if aad_prefix is set
792 if (!props.aad_prefix.empty()) {
793 enc.write_field_bool(3, true);
794 }
795 enc.write_stop();
796 }
797 enc.end_struct();
798 enc.write_stop();
799 }
800 enc.end_struct();
801
802 // Field 3 (Signet extension): footer_encrypted flag
804
805 enc.write_stop();
806 return enc.data();
807}
808
811 const uint8_t* data, size_t size) {
812
815 "Thrift file properties exceeds 1 MB limit"};
816 }
817
818 thrift::CompactDecoder dec(data, size);
820 bool found_algo = false;
821
822 while (true) {
823 auto fh = dec.read_field_header();
824 if (fh.is_stop()) break;
825
826 switch (fh.field_id) {
827 case 1: {
828 // EncryptionAlgorithm union — serialized as nested struct
829 dec.begin_struct();
830 auto inner_fh = dec.read_field_header();
831 if (!inner_fh.is_stop()) {
832 if (inner_fh.field_id == 1) {
834 } else {
836 }
837 found_algo = true;
838
839 // Read inner AesGcmV1/AesGcmCtrV1 struct
840 dec.begin_struct();
841 while (true) {
842 auto aes_fh = dec.read_field_header();
843 if (aes_fh.is_stop()) break;
844 if (aes_fh.field_id == 1 &&
845 aes_fh.thrift_type == thrift::compact_type::BINARY) {
846 auto prefix_bin = dec.read_binary();
847 props.aad_prefix = std::string(
848 reinterpret_cast<const char*>(prefix_bin.data()),
849 prefix_bin.size());
850 } else {
851 dec.skip_field(aes_fh.thrift_type);
852 }
853 }
854 dec.end_struct();
855
856 // Read remaining union fields until stop
857 while (true) {
858 auto ufh = dec.read_field_header();
859 if (ufh.is_stop()) break;
860 dec.skip_field(ufh.thrift_type);
861 }
862 }
863 dec.end_struct();
864 break;
865 }
866 case 3: {
867 // footer_encrypted bool (Signet extension)
868 // Bool fields have value encoded in type nibble
869 props.footer_encrypted =
870 (fh.thrift_type == thrift::compact_type::BOOL_TRUE);
871 break;
872 }
873 default:
874 dec.skip_field(fh.thrift_type);
875 break;
876 }
877 }
878
879 if (!found_algo) {
881 "Thrift FileCryptoMetaData: missing encryption_algorithm"};
882 }
883
884 return props;
885}
886
887} // namespace detail::thrift_crypto
888
889// ===========================================================================
890// MetadataFormat — selects TLV (legacy) or Thrift (spec-compliant) wire format
891// ===========================================================================
892
/// Wire format for key metadata serialization.
897enum class MetadataFormat : int32_t {
898 TLV = 0, ///< Signet v1 custom TLV format.
899 THRIFT = 1, ///< Parquet Thrift Compact Protocol (spec-compliant).
900};
901
902// ===========================================================================
903// Algorithm deprecation framework (Gap C-4)
904//
905// NIST SP 800-131A Rev. 2 — Transitioning the Use of Cryptographic
906// Algorithms and Key Lengths.
907//
908// Tracks algorithm lifecycle status to support deprecation planning.
909//
910// Reference: NIST SP 800-131A Rev. 2 (March 2019)
911// ===========================================================================
912
/// Algorithm lifecycle status per NIST SP 800-131A.
914enum class AlgorithmStatus : int32_t {
915 ACCEPTABLE = 0,
916 DEPRECATED = 1, ///< Still allowed but scheduled for removal.
917 DISALLOWED = 2,
918 LEGACY = 3, ///< Only for processing existing data (no new encryption).
919};
920
929
930// ===========================================================================
931// INTERNAL key mode production gate (Gap C-15)
932//
933// FIPS 140-3 §7.7 — INTERNAL key mode stores plaintext keys in file
934// metadata, which is unsuitable for production. Builds that define
935// SIGNET_REQUIRE_COMMERCIAL_LICENSE (non-zero) reject INTERNAL at call time.
936//
937// Reference: FIPS 140-3 §7.7 — Key/CSP zeroization
938// ===========================================================================
939
949#if defined(SIGNET_REQUIRE_COMMERCIAL_LICENSE) && SIGNET_REQUIRE_COMMERCIAL_LICENSE
950 if (mode == KeyMode::INTERNAL) {
952 "INTERNAL key mode stores plaintext keys in file metadata — "
953 "not allowed in production builds (FIPS 140-3 §7.7). "
954 "Use EXTERNAL key mode with a KMS client."};
955 }
956#else
957 (void)mode;
958#endif
959 return {};
960}
961
962// ===========================================================================
963// Key rotation API (Gap T-7)
964//
965// PCI-DSS, HIPAA, SOX, and DORA Art. 9(2) require cryptographic key
966// rotation with documented procedures. This API provides the mechanism
967// to rotate keys and re-encrypt files.
968//
969// Reference: NIST SP 800-57 Part 1 Rev. 5 §5.3.5 (key transition)
970// PCI-DSS v4.0 Req. 3.6.4 (cryptographic key rotation)
971// ===========================================================================
972
975 std::string key_id;
976 std::vector<uint8_t> old_key;
977 std::vector<uint8_t> new_key;
978 std::string reason;
979 int64_t requested_ns = 0;
980};
981
984 bool success = false;
985 std::string key_id;
986 int64_t completed_ns = 0;
987 int64_t files_re_encrypted = 0;
988 std::string error_message;
989};
990
991// ===========================================================================
992// CryptoShredder — GDPR right-to-erasure via per-subject key destruction
993// (Gap G-1)
994//
995// Implements GDPR Art. 17 "right to be forgotten" via cryptographic erasure:
996// each data subject's records are encrypted with a unique per-subject DEK.
997// To "erase" a subject's data, the subject's DEK is destroyed, rendering
998// all their encrypted records permanently unreadable without needing to
999// locate and delete every copy of the data.
1000//
1001// This approach is recognized by EDPB (European Data Protection Board)
1002// Guidelines 8/2020 as a valid erasure method when "deletion of personal
1003// data is not feasible" (e.g., in immutable data stores, backups, or
1004// distributed systems).
1005//
1006// Usage:
1007// CryptoShredder shredder;
1008// auto result = shredder.register_subject("user-42", subject_dek);
1009// // ... write encrypted data using subject_dek ...
1010// auto erased = shredder.shred("user-42"); // Destroys the DEK → data is unreadable
1011//
1012// References:
1013// - GDPR Art. 17 — Right to erasure
1014// - EDPB Guidelines 8/2020 — Technical measures for erasure
1015// - NIST SP 800-88 Rev. 1 §2.4 — Cryptographic Erase (CE)
1016// ===========================================================================
1017
1022public:
1028 const std::string& subject_id,
1029 const std::vector<uint8_t>& dek) {
1030
1031 if (subject_id.empty()) {
1033 "CryptoShredder: subject_id must not be empty"};
1034 }
1035 if (keys_.count(subject_id) > 0) {
1037 "CryptoShredder: subject '" + subject_id + "' already registered"};
1038 }
1039 keys_[subject_id] = dek;
1040 return {};
1041 }
1042
1047 const std::string& subject_id) const {
1048
1049 auto it = keys_.find(subject_id);
1050 if (it == keys_.end()) {
1051 // Check if this subject was previously shredded
1052 if (shredded_.count(subject_id) > 0) {
1054 "CryptoShredder: subject '" + subject_id +
1055 "' has been cryptographically erased (GDPR Art. 17)"};
1056 }
1058 "CryptoShredder: subject '" + subject_id + "' not found"};
1059 }
1060 return &it->second;
1061 }
1062
1070 [[nodiscard]] expected<void> shred(const std::string& subject_id) {
1071 auto it = keys_.find(subject_id);
1072 if (it == keys_.end()) {
1074 "CryptoShredder: subject '" + subject_id + "' not found"};
1075 }
1076
1077 // Securely zero the key material before erasing
1078 volatile unsigned char* p =
1079 reinterpret_cast<volatile unsigned char*>(it->second.data());
1080 for (size_t i = 0; i < it->second.size(); ++i) p[i] = 0;
1081
1082 keys_.erase(it);
1083 shredded_.insert(subject_id);
1084 return {};
1085 }
1086
1088 [[nodiscard]] bool is_shredded(const std::string& subject_id) const {
1089 return shredded_.count(subject_id) > 0;
1090 }
1091
/// Number of active (non-shredded) subjects, i.e. entries currently in keys_.
1093 [[nodiscard]] size_t active_count() const { return keys_.size(); }
1094
/// Number of shredded subjects, i.e. entries recorded in shredded_.
1096 [[nodiscard]] size_t shredded_count() const { return shredded_.size(); }
1097
1098private:
1099 std::unordered_map<std::string, std::vector<uint8_t>> keys_;
1100 std::unordered_set<std::string> shredded_;
1101};
1102
1103} // namespace signet::forge::crypto
Per-subject key store supporting cryptographic erasure.
expected< const std::vector< uint8_t > * > get_key(const std::string &subject_id) const
Retrieve a subject's DEK for encryption/decryption.
bool is_shredded(const std::string &subject_id) const
Check if a subject has been cryptographically erased.
size_t shredded_count() const
Number of shredded subjects.
expected< void > shred(const std::string &subject_id)
Cryptographically shred a subject's data by destroying their DEK.
size_t active_count() const
Number of active (non-shredded) subjects.
expected< void > register_subject(const std::string &subject_id, const std::vector< uint8_t > &dek)
Register a data subject's DEK.
Abstract KMS client interface for DEK/KEK key wrapping.
virtual expected< std::vector< uint8_t > > unwrap_key(const std::vector< uint8_t > &wrapped_dek, const std::string &master_key_id) const =0
Unwrap (decrypt) a wrapped DEK using the KEK identified by master_key_id.
virtual expected< std::vector< uint8_t > > wrap_key(const std::vector< uint8_t > &dek, const std::string &master_key_id) const =0
Wrap (encrypt) a DEK under the KEK identified by master_key_id.
A lightweight result type that holds either a success value of type T or an Error.
Definition error.hpp:145
Thrift Compact Protocol reader.
Definition compact.hpp:267
void begin_struct()
Push a new field-ID context for reading a nested struct.
Definition compact.hpp:508
void end_struct()
Pop the field-ID context after finishing a nested struct.
Definition compact.hpp:515
FieldHeader read_field_header()
Read a field header.
Definition compact.hpp:285
void skip_field(uint8_t thrift_type)
Skip a field without parsing its value.
Definition compact.hpp:427
std::vector< uint8_t > read_binary()
Read raw binary data (varint-length-prefixed bytes).
Definition compact.hpp:390
Thrift Compact Protocol writer.
Definition compact.hpp:72
void begin_struct()
Push a new field-ID context for a nested struct.
Definition compact.hpp:100
void end_struct()
Pop the field-ID context after finishing a nested struct.
Definition compact.hpp:103
void write_field_bool(int16_t field_id, bool val)
Write a bool field where the value is embedded in the field header's type nibble (1 = true,...
Definition compact.hpp:120
const std::vector< uint8_t > & data() const
Returns a const reference to the underlying byte buffer.
Definition compact.hpp:200
void write_field(int16_t field_id, uint8_t thrift_type)
Write a field header.
Definition compact.hpp:85
void write_stop()
Write struct stop marker (0x00).
Definition compact.hpp:97
void write_binary(const uint8_t *data, size_t len)
Write raw binary data as varint-length-prefixed bytes.
Definition compact.hpp:174
Thrift Compact Protocol encoder and decoder for Parquet metadata serialization.
void write_le32(uint8_t *dst, uint32_t val)
Write a 4-byte little-endian uint32 to dst.
constexpr uint32_t TAG_KEY_MATERIAL
Tag: raw key material (variable blob).
constexpr uint32_t TAG_KEY_ID
Tag: KMS key identifier (variable UTF-8).
bool tlv_to_i32(const TlvField &field, int32_t &out)
Read an int32 from a TLV field's data (must be exactly 4 bytes).
void append_tlv_blob(std::vector< uint8_t > &buf, uint32_t tag, const std::vector< uint8_t > &blob)
Append a TLV field containing a blob.
constexpr size_t MAX_METADATA_SIZE
Maximum total metadata size (1 MB cap to prevent memory exhaustion from crafted payloads,...
void append_tlv_str(std::vector< uint8_t > &buf, uint32_t tag, const std::string &s)
Append a TLV field containing a string.
std::string tlv_to_str(const TlvField &field)
Read a string from a TLV field's data.
std::vector< uint8_t > tlv_to_blob(const TlvField &field)
Read a blob from a TLV field's data.
void append_tlv_u8(std::vector< uint8_t > &buf, uint32_t tag, uint8_t val)
Append a TLV field containing a single byte.
bool tlv_to_u8(const TlvField &field, uint8_t &out)
Read a uint8 from a TLV field's data (must be exactly 1 byte).
void append_tlv_i32(std::vector< uint8_t > &buf, uint32_t tag, int32_t val)
Append a TLV field containing a single int32_t (little-endian).
constexpr uint32_t TAG_ALGORITHM
Tag: encryption algorithm (4 bytes, int32_t LE).
constexpr uint32_t MAX_TLV_LENGTH
Maximum TLV field length (64 MB cap to prevent memory exhaustion from malformed data).
void append_tlv(std::vector< uint8_t > &buf, uint32_t tag, const uint8_t *data, uint32_t len)
Append a TLV field: [4-byte LE tag] [4-byte LE length] [data].
constexpr uint32_t TAG_KEY_MODE
Tag: key mode (4 bytes, int32_t LE).
constexpr uint32_t TAG_AAD_PREFIX
Tag: AAD prefix string (variable UTF-8).
bool read_tlv(const uint8_t *buf, size_t buf_size, size_t &offset, TlvField &field)
Parse the next TLV field from a buffer.
constexpr uint32_t TAG_FOOTER_ENCRYPTED
Tag: footer-encrypted flag (1 byte, 0/1).
uint32_t read_le32(const uint8_t *src)
Read a 4-byte little-endian uint32 from src.
std::vector< uint8_t > serialize_file_properties(const FileEncryptionProperties &props)
Serialize FileEncryptionProperties to Thrift Compact Protocol.
expected< EncryptionKeyMetadata > deserialize_key_metadata(const uint8_t *data, size_t size)
Deserialize EncryptionKeyMetadata from Thrift Compact Protocol.
expected< FileEncryptionProperties > deserialize_file_properties(const uint8_t *data, size_t size)
Deserialize FileEncryptionProperties from Thrift Compact Protocol.
std::vector< uint8_t > serialize_key_metadata(const EncryptionKeyMetadata &meta, EncryptionAlgorithm algo=EncryptionAlgorithm::AES_GCM_CTR_V1, const std::string &aad_prefix="")
Serialize EncryptionKeyMetadata to Thrift Compact Protocol.
KeyMode
How the encryption key is stored or referenced.
@ INTERNAL
Key material stored directly in file metadata (testing/dev).
@ EXTERNAL
Key referenced by KMS key ID; actual key resolved from KMS at runtime.
expected< void > validate_key_mode_for_production(KeyMode mode)
Check if INTERNAL key mode is allowed in the current build.
MetadataFormat
Wire format for key metadata serialization.
@ TLV
Signet v1 custom TLV format.
@ THRIFT
Parquet Thrift Compact Protocol (spec-compliant).
AlgorithmStatus
Algorithm lifecycle status per NIST SP 800-131A.
@ LEGACY
Only for processing existing data (no new encryption).
@ DEPRECATED
Still allowed but scheduled for removal.
EncryptionAlgorithm
Encryption algorithm identifier.
@ AES_GCM_CTR_V1
AES-256-GCM for footer, AES-256-CTR for column data (Parquet default).
@ AES_GCM_V1
AES-256-GCM for both footer and column data.
constexpr uint8_t STRUCT
Nested struct.
Definition compact.hpp:39
constexpr uint8_t BINARY
Length-prefixed bytes (also used for STRING).
Definition compact.hpp:35
constexpr uint8_t BOOL_TRUE
Boolean true (embedded in field header).
Definition compact.hpp:28
@ ENCRYPTION_ERROR
An encryption or decryption operation failed (bad key, tampered ciphertext, PME error).
@ INVALID_ARGUMENT
A caller-supplied argument is outside the valid range or violates a precondition.
Lightweight error value carrying an ErrorCode and a human-readable message.
Definition error.hpp:101
Algorithm deprecation entry.
std::string transition_guidance
Migration guidance.
int32_t min_key_bits
Minimum key length in bits.
std::string algorithm_name
E.g. "AES-256-GCM", "AES-128-CTR", "3DES".
int64_t sunset_ns
Planned deprecation timestamp (0 = no sunset).
Specifies the encryption key for a single Parquet column.
std::string column_name
Parquet column path (e.g. "a.b.c").
std::string key_id
KMS key identifier (EXTERNAL mode).
std::vector< uint8_t > key
32-byte AES-256 key (INTERNAL mode).
Top-level configuration structure that drives FileEncryptor / FileDecryptor.
std::vector< uint8_t > default_column_key
Default column key (32 bytes).
std::string default_column_key_id
KMS key identifier for the default column key (EXTERNAL mode).
std::shared_ptr< IKmsClient > kms_client
Optional KMS client for DEK/KEK key wrapping (EXTERNAL key mode).
bool encrypt_footer
If true, the footer is encrypted.
std::vector< uint8_t > footer_key
32-byte AES-256 key for encrypting the Parquet footer (FileMetaData).
KeyMode key_mode
INTERNAL: keys stored in file metadata. EXTERNAL: KMS references only.
@ SPEC_BINARY
Parquet PME spec: fixed-width binary AAD.
@ LEGACY
Signet v1: null-separated string AAD.
std::string aad_prefix
AAD prefix – typically a file identifier or URI.
EncryptionAlgorithm algorithm
Encryption algorithm (GCM everywhere, or GCM-footer + CTR-columns).
std::vector< ColumnKeySpec > column_keys
Per-column key specifications. Columns listed here get their own key.
std::string footer_key_id
KMS key identifier for the footer key (EXTERNAL mode).
Per-key metadata stored alongside encrypted Parquet components.
static expected< EncryptionKeyMetadata > deserialize(const uint8_t *data, size_t size)
Deserialize from bytes.
std::vector< uint8_t > key_material
Raw AES key bytes (INTERNAL mode only).
std::string key_id
KMS key reference (EXTERNAL mode).
std::vector< uint8_t > serialize() const
Serialize to bytes using TLV format.
KeyMode key_mode
INTERNAL or EXTERNAL key mode.
Stored in the Parquet FileMetaData.encryption_algorithm field.
std::string aad_prefix
AAD prefix bound into GCM auth tags.
EncryptionAlgorithm algorithm
Encryption algorithm.
static expected< FileEncryptionProperties > deserialize(const uint8_t *data, size_t size)
Deserialize from bytes.
std::vector< uint8_t > serialize() const
Serialize to bytes using TLV format.
bool footer_encrypted
Whether the footer itself is encrypted.
Key rotation request describing old → new key transition.
std::vector< uint8_t > new_key
Replacement key.
std::vector< uint8_t > old_key
Current (old) key.
std::string reason
Rotation reason: "scheduled", "compromised", "policy".
std::string key_id
Key being rotated.
int64_t requested_ns
Rotation request timestamp.
int64_t files_re_encrypted
Number of files re-encrypted.
int64_t completed_ns
Completion timestamp.
std::string error_message
Error message (if not successful).
bool success
Whether the rotation completed.
Parsed TLV (tag-length-value) field from serialized metadata.
uint32_t length
Length of the field data in bytes.
const uint8_t * data
Pointer to the field data within the source buffer.
uint32_t tag
4-byte tag identifying the field type.