Signet Forge 0.1.0
C++20 Parquet library with AI-native extensions
DEMO
Loading...
Searching...
No Matches
inference_log.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: AGPL-3.0-or-later
2// Copyright 2026 Johnson Ogundeji
3// See LICENSE_COMMERCIAL for full terms.
4#pragma once
5
6#if !defined(SIGNET_ENABLE_COMMERCIAL) || !SIGNET_ENABLE_COMMERCIAL
7#error "signet/ai/inference_log.hpp requires SIGNET_ENABLE_COMMERCIAL=ON (AGPL-3.0 commercial tier). See LICENSE_COMMERCIAL."
8#endif
9
10// ---------------------------------------------------------------------------
11// inference_log.hpp -- LLM/ML Inference Audit Log
12//
13// Logs model inference operations with cryptographic hash chaining for
14// regulatory compliance and operational auditing. Captures inference
15// metadata (latency, batch size, token counts) without storing raw
16// inputs/outputs (only their SHA-256 hashes for privacy).
17//
18// Regulatory requirements addressed:
19// - EU AI Act Art 12/19: Automatic logging of AI system operations
20// - GDPR Art 35: Data protection impact — input/output hashing
21// - SEC 17a-4: Tamper-evident records retention
22//
23// Header-only. Part of the signet::forge AI module.
24// ---------------------------------------------------------------------------
25
28#include "signet/error.hpp"
29#include "signet/schema.hpp"
30#include "signet/types.hpp"
31#include "signet/writer.hpp"
32#include "signet/reader.hpp"
33
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <memory>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
42
43namespace signet::forge {
44
/// Category of model inference being logged.
///
/// The numeric values are part of the serialized wire format: serialize()
/// writes the value as a 32-bit LE integer and deserialize() accepts only
/// 0..6 or the CUSTOM sentinel (255), so existing values must never be
/// renumbered.
enum class InferenceType : int32_t {
    CLASSIFICATION = 0,
    REGRESSION = 1,
    EMBEDDING = 2,
    GENERATION = 3,
    RANKING = 4,
    ANOMALY = 5,
    RECOMMENDATION = 6,
    CUSTOM = 255   // Escape hatch for inference kinds not modeled above.
};
57
    // NOTE(review): this view is a Doxygen rendering that elided several
    // declarations. serialize()/deserialize() below also reference
    // inference_type, training_dataset_size, training_data_characteristics,
    // model_training_end_ns, model_training_data_cutoff_ns and
    // model_retraining_schedule — their declarations are not visible here;
    // confirm against the repository.

    int64_t timestamp_ns{0};          // Event time in ns; 0 = "stamp at log()".
    std::string model_id;             // Identifier of the model that ran.
    std::string model_version;        // Version string of that model.
    std::vector<float> input_embedding;  // Optional embedding of the input.
    std::string input_hash;           // SHA-256 of the raw input (privacy: raw data is not stored).
    std::string output_hash;          // SHA-256 of the raw output.
    float output_score{0.0f};         // Primary score/confidence of the inference.
    int64_t latency_ns{0};            // End-to-end inference latency in ns.
    int32_t batch_size{1};            // Number of items in the inference batch.
    int32_t input_tokens{0};          // Token count of the input (LLM workloads).
    int32_t output_tokens{0};         // Token count of the output.
    std::string user_id_hash;         // Hashed user identifier (privacy-preserving).
    std::string session_id;           // Caller session identifier.
    std::string metadata_json;        // Free-form JSON metadata.

    // EU AI Act Art.13(3)(b)(ii): training data provenance for high-risk AI
    // transparency — deployers must be able to identify the data used to train
    // the model that produced each inference.
    std::string training_dataset_id;

    // EU AI Act Art.12/13: additional training metadata (M-18)
    /// Serialize this record to a flat little-endian byte buffer.
    ///
    /// The field order below IS the wire format consumed by deserialize()
    /// and hashed into the audit chain — do not reorder. Strings are written
    /// as a 32-bit LE length prefix followed by raw bytes; the embedding as a
    /// 32-bit LE count followed by IEEE-754 floats.
    ///
    /// @return The serialized bytes (never empty; fixed fields always present).
    [[nodiscard]] inline std::vector<uint8_t> serialize() const {
        std::vector<uint8_t> buf;
        buf.reserve(256);   // Typical record size; avoids early reallocations.

        // timestamp_ns: 8 bytes LE
        append_le64(buf, static_cast<uint64_t>(timestamp_ns));

        // model_id: string
        append_string(buf, model_id);

        // model_version: string
        append_string(buf, model_version);

        // inference_type: 4 bytes LE
        append_le32(buf, static_cast<uint32_t>(inference_type));

        // input_embedding: vector of floats
        append_le32(buf, static_cast<uint32_t>(input_embedding.size()));
        for (float f : input_embedding) {
            append_float(buf, f);
        }

        // input_hash: string
        append_string(buf, input_hash);

        // output_hash: string
        append_string(buf, output_hash);

        // output_score: float (4 bytes)
        append_float(buf, output_score);

        // latency_ns: 8 bytes LE
        append_le64(buf, static_cast<uint64_t>(latency_ns));

        // batch_size: 4 bytes LE
        append_le32(buf, static_cast<uint32_t>(batch_size));

        // input_tokens: 4 bytes LE
        append_le32(buf, static_cast<uint32_t>(input_tokens));

        // output_tokens: 4 bytes LE
        append_le32(buf, static_cast<uint32_t>(output_tokens));

        // user_id_hash: string
        append_string(buf, user_id_hash);

        // session_id: string
        append_string(buf, session_id);

        // metadata_json: string
        append_string(buf, metadata_json);

        // M31: EU AI Act Art.13(3)(b)(ii) training data provenance fields.
        // Appended at the end so that older readers (which stop here) still
        // parse the fixed prefix; deserialize() treats them as optional.
        append_string(buf, training_dataset_id);
        append_le64(buf, static_cast<uint64_t>(training_dataset_size));
        append_string(buf, training_data_characteristics);

        // H-19: EU AI Act Art.12/13 additional training metadata (also an
        // optional trailing group for backward compatibility).
        append_le64(buf, static_cast<uint64_t>(model_training_end_ns));
        append_le64(buf, static_cast<uint64_t>(model_training_data_cutoff_ns));
        append_string(buf, model_retraining_schedule);

        return buf;
    }
161
    /// Reconstruct an InferenceRecord from bytes produced by serialize().
    ///
    /// Every read is bounds-checked; any truncation yields a CORRUPT_PAGE
    /// error identifying the field that failed. The two trailing field groups
    /// (training provenance, training metadata) are optional so buffers
    /// written by older versions still parse.
    ///
    /// @param data  Pointer to the serialized bytes.
    /// @param size  Number of bytes available at `data`.
    /// @return The decoded record, or a CORRUPT_PAGE error.
    [[nodiscard]] static inline expected<InferenceRecord> deserialize(
        const uint8_t* data, size_t size) {
        size_t offset = 0;
        InferenceRecord rec;

        if (!read_le64(data, size, offset, rec.timestamp_ns)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated timestamp_ns"};
        }
        if (!read_string(data, size, offset, rec.model_id)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated model_id"};
        }
        if (!read_string(data, size, offset, rec.model_version)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated model_version"};
        }

        int32_t it_val = 0;
        if (!read_le32(data, size, offset, it_val)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated inference_type"};
        }
        rec.inference_type = static_cast<InferenceType>(it_val);
        // Reject values outside the declared enum range (0..6 or CUSTOM=255).
        // NOTE(review): the statement guarded by this check (Doxygen line 188)
        // is elided from this rendering — presumably an error return; confirm
        // against the repository.
        if (it_val < 0 || (it_val > 6 && it_val != 255))

        uint32_t emb_count = 0;
        if (!read_le32_u(data, size, offset, emb_count)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated embedding count"};
        }
        // Guard before resize(): a crafted count must not trigger a huge
        // allocation. Division avoids overflow in `emb_count * sizeof(float)`.
        if (offset > size || emb_count > (size - offset) / sizeof(float)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: embedding count exceeds remaining data"};
        }
        rec.input_embedding.resize(emb_count);
        for (uint32_t i = 0; i < emb_count; ++i) {
            if (!read_float(data, size, offset, rec.input_embedding[i])) {
                return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated embedding data"};
            }
        }

        if (!read_string(data, size, offset, rec.input_hash)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated input_hash"};
        }
        if (!read_string(data, size, offset, rec.output_hash)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated output_hash"};
        }
        if (!read_float(data, size, offset, rec.output_score)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated output_score"};
        }
        if (!read_le64(data, size, offset, rec.latency_ns)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated latency_ns"};
        }
        if (!read_le32(data, size, offset, rec.batch_size)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated batch_size"};
        }
        if (!read_le32(data, size, offset, rec.input_tokens)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated input_tokens"};
        }
        if (!read_le32(data, size, offset, rec.output_tokens)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated output_tokens"};
        }
        if (!read_string(data, size, offset, rec.user_id_hash)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated user_id_hash"};
        }
        if (!read_string(data, size, offset, rec.session_id)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated session_id"};
        }
        if (!read_string(data, size, offset, rec.metadata_json)) {
            return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated metadata_json"};
        }

        // M31: EU AI Act Art.13(3)(b)(ii) training data provenance fields
        // These fields are optional for backward compatibility with older serialized data.
        if (offset < size) {
            if (!read_string(data, size, offset, rec.training_dataset_id)) {
                return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated training_dataset_id"};
            }
            if (!read_le64(data, size, offset, rec.training_dataset_size)) {
                return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated training_dataset_size"};
            }
            if (!read_string(data, size, offset, rec.training_data_characteristics)) {
                return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated training_data_characteristics"};
            }
        }

        // H-19: EU AI Act Art.12/13 additional training metadata (optional for backward compat)
        if (offset < size) {
            if (!read_le64(data, size, offset, rec.model_training_end_ns)) {
                return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated model_training_end_ns"};
            }
            if (!read_le64(data, size, offset, rec.model_training_data_cutoff_ns)) {
                return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated model_training_data_cutoff_ns"};
            }
            if (!read_string(data, size, offset, rec.model_retraining_schedule)) {
                return Error{ErrorCode::CORRUPT_PAGE, "InferenceRecord: truncated model_retraining_schedule"};
            }
        }

        return rec;
    }
264
265private:
266 // -- Serialization helpers -----------------------------------------------
267
268 static inline void append_le32(std::vector<uint8_t>& buf, uint32_t v) {
269 buf.push_back(static_cast<uint8_t>(v));
270 buf.push_back(static_cast<uint8_t>(v >> 8));
271 buf.push_back(static_cast<uint8_t>(v >> 16));
272 buf.push_back(static_cast<uint8_t>(v >> 24));
273 }
274
275 static inline void append_le64(std::vector<uint8_t>& buf, uint64_t v) {
276 for (int i = 0; i < 8; ++i) {
277 buf.push_back(static_cast<uint8_t>(v >> (i * 8)));
278 }
279 }
280
281 static inline void append_float(std::vector<uint8_t>& buf, float v) {
282 uint32_t bits;
283 std::memcpy(&bits, &v, 4);
284 append_le32(buf, bits);
285 }
286
287 static inline void append_string(std::vector<uint8_t>& buf, const std::string& s) {
288 // Clamp string size to UINT32_MAX to prevent truncation on cast
289 const size_t clamped = (std::min)(s.size(), static_cast<size_t>(UINT32_MAX));
290 append_le32(buf, static_cast<uint32_t>(clamped));
291 buf.insert(buf.end(), s.begin(), s.begin() + static_cast<ptrdiff_t>(clamped));
292 }
293
294 // -- Deserialization helpers -----------------------------------------------
295
296 static inline bool read_le64(const uint8_t* data, size_t size, size_t& offset, int64_t& out) {
297 if (offset + 8 > size) return false;
298 uint64_t v = 0;
299 for (int i = 0; i < 8; ++i) {
300 v |= static_cast<uint64_t>(data[offset + i]) << (i * 8);
301 }
302 out = static_cast<int64_t>(v);
303 offset += 8;
304 return true;
305 }
306
307 static inline bool read_le32(const uint8_t* data, size_t size, size_t& offset, int32_t& out) {
308 if (offset + 4 > size) return false;
309 uint32_t v = 0;
310 for (int i = 0; i < 4; ++i) {
311 v |= static_cast<uint32_t>(data[offset + i]) << (i * 8);
312 }
313 out = static_cast<int32_t>(v);
314 offset += 4;
315 return true;
316 }
317
318 static inline bool read_le32_u(const uint8_t* data, size_t size, size_t& offset, uint32_t& out) {
319 if (offset + 4 > size) return false;
320 out = 0;
321 for (int i = 0; i < 4; ++i) {
322 out |= static_cast<uint32_t>(data[offset + i]) << (i * 8);
323 }
324 offset += 4;
325 return true;
326 }
327
328 static inline bool read_float(const uint8_t* data, size_t size, size_t& offset, float& out) {
329 if (offset + 4 > size) return false;
330 uint32_t bits = 0;
331 for (int i = 0; i < 4; ++i) {
332 bits |= static_cast<uint32_t>(data[offset + i]) << (i * 8);
333 }
334 std::memcpy(&out, &bits, 4);
335 offset += 4;
336 return true;
337 }
338
339 static constexpr uint32_t MAX_STRING_LEN = 16u * 1024u * 1024u; // 16 MB
340
341 static inline bool read_string(const uint8_t* data, size_t size, size_t& offset, std::string& out) {
342 uint32_t len = 0;
343 if (!read_le32_u(data, size, offset, len)) return false;
344 if (len > MAX_STRING_LEN) return false;
345 if (offset + len > size) return false;
346 out.assign(reinterpret_cast<const char*>(data + offset), len);
347 offset += len;
348 return true;
349 }
350};
351
352namespace detail {
353
/// Render an embedding vector as a compact JSON array, e.g. "[1,2.5,-3]".
///
/// Floats are formatted with "%.8g" (enough digits to round-trip float32);
/// an empty vector renders as "[]".
inline std::string embedding_to_json(const std::vector<float>& embedding) {
    if (embedding.empty()) return "[]";

    std::string out;
    out.push_back('[');
    const char* sep = "";
    for (float value : embedding) {
        out += sep;
        char num[32];
        std::snprintf(num, sizeof(num), "%.8g", static_cast<double>(value));
        out += num;
        sep = ",";
    }
    out.push_back(']');
    return out;
}
374
/// Parse a JSON-style array of numbers ("[1, 2.5, -3]") into floats.
///
/// Best-effort parser: input that is not bracketed returns an empty vector;
/// unparseable elements are silently skipped; at most one million elements
/// are produced so crafted input cannot exhaust memory.
inline std::vector<float> json_to_embedding(const std::string& json) {
    // L25: limit parsed elements to prevent memory exhaustion from crafted input
    static constexpr size_t MAX_JSON_ARRAY_ELEMENTS = 1'000'000;
    std::vector<float> values;

    // Must be at least "[]" with matching brackets.
    if (json.size() < 2 || json.front() != '[' || json.back() != ']') {
        return values;
    }

    const size_t last = json.size() - 1;  // index of the closing ']'
    size_t cur = 1;                       // first char after '['

    while (cur < last && values.size() < MAX_JSON_ARRAY_ELEMENTS) {
        // Skip leading spaces/tabs before the next token.
        while (cur < last && (json[cur] == ' ' || json[cur] == '\t')) {
            ++cur;
        }
        if (cur >= last) break;

        // A token runs up to the next comma (or the closing bracket).
        const size_t tok_begin = cur;
        while (cur < last && json[cur] != ',') ++cur;
        std::string token = json.substr(tok_begin, cur - tok_begin);

        // Trim trailing spaces/tabs.
        size_t tok_len = token.size();
        while (tok_len > 0 && (token[tok_len - 1] == ' ' || token[tok_len - 1] == '\t')) {
            --tok_len;
        }
        token.resize(tok_len);

        if (!token.empty()) {
            try {
                values.push_back(std::stof(token));
            } catch (...) {
                // Unparseable value: skip, keeping best-effort semantics.
            }
        }

        if (cur < last && json[cur] == ',') ++cur;
    }

    return values;
}
417
418} // namespace detail
419
/// Build the Parquet schema for inference log files.
///
/// Column order is a contract: InferenceLogWriter::flush() emits row values
/// in exactly this order, and InferenceLogReader::load_columns() reads
/// columns back by these positional indices (0..17 visible here, then the
/// lineage columns).
[[nodiscard]] inline Schema inference_log_schema() {
    return Schema::builder("inference_log")
        // 0-14: the inference record fields
        .column<int64_t>("timestamp_ns", LogicalType::TIMESTAMP_NS)
        .column<std::string>("model_id")
        .column<std::string>("model_version")
        .column<int32_t>("inference_type")
        .column<std::string>("input_embedding") // JSON array
        .column<std::string>("input_hash")
        .column<std::string>("output_hash")
        .column<double>("output_score")         // widened from float on disk
        .column<int64_t>("latency_ns")
        .column<int32_t>("batch_size")
        .column<int32_t>("input_tokens")
        .column<int32_t>("output_tokens")
        .column<std::string>("user_id_hash")
        .column<std::string>("session_id")
        .column<std::string>("metadata_json")
        // 15-17: hash-chain columns used by verify_chain()
        .column<int64_t>("chain_seq")
        .column<std::string>("chain_hash")
        .column<std::string>("prev_hash")
        // 18-21: per-row lineage columns
        .column<int64_t>("row_id")
        .column<int32_t>("row_version")
        .column<std::string>("row_origin_file")
        .column<std::string>("row_prev_hash")
        .build();
}
467
487public:
    /// Create an inference log writer.
    ///
    /// @param output_dir   Directory to write Parquet files into. Must be
    ///                     non-empty and free of ".." path segments.
    /// @param chain_id     Audit chain identifier; auto-generated when empty.
    ///                     Restricted to [a-zA-Z0-9_-] (it is spliced into
    ///                     file names by flush()).
    /// @param max_records  Batch size that triggers an automatic flush().
    /// @throws std::runtime_error     if the commercial license check fails.
    /// @throws std::invalid_argument  on bad output_dir or chain_id.
    inline InferenceLogWriter(const std::string& output_dir,
                              const std::string& chain_id = "",
                              size_t max_records = 100000)
        : output_dir_(output_dir)
        , chain_id_(chain_id.empty() ? generate_chain_id() : chain_id)
        // NOTE(review): both arms of this conditional evaluate to the same
        // chain id (chain_id_ is already the resolved value) — presumably
        // intentional parity with chain_id_'s initializer; confirm.
        , lineage_tracker_(chain_id.empty() ? chain_id_ : chain_id, 1)
        , max_records_(max_records)
        , schema_(inference_log_schema())
    {
        // License gate first: no object state matters if the tier is wrong.
        auto license = commercial::require_feature("InferenceLogWriter");
        if (!license) {
            throw std::runtime_error(license.error().message);
        }

        // Keep constructor hardening in parity with DecisionLogWriter.
        if (output_dir_.empty()) {
            throw std::invalid_argument("InferenceLogWriter: output_dir must not be empty");
        }
        // Walk each '/'- or '\\'-separated segment and reject "..", which
        // would allow writing outside output_dir.
        for (size_t s = 0, e; s <= output_dir_.size(); s = e + 1) {
            e = output_dir_.find_first_of("/\\", s);
            if (e == std::string::npos) e = output_dir_.size();
            if (output_dir_.substr(s, e - s) == "..") {
                throw std::invalid_argument(
                    "InferenceLogWriter: output_dir must not contain '..' path traversal");
            }
        }
        // chain_id_ is embedded in file names; restrict to a safe alphabet.
        for (char c : chain_id_) {
            if (!std::isalnum(static_cast<unsigned char>(c)) && c != '_' && c != '-') {
                throw std::invalid_argument(
                    "InferenceLogWriter: chain_id must only contain [a-zA-Z0-9_-]");
            }
        }
    }
525
    /// Log a single inference event.
    ///
    /// Serializes the record, appends it to the cryptographic hash chain,
    /// and buffers it; flush() is invoked automatically once the pending
    /// batch reaches max_records_.
    ///
    /// @param record  Record to log. A timestamp_ns of 0 means "use the
    ///                current time" for the chain entry.
    /// @return The hash-chain entry created for this record, or an error
    ///         from usage metering / auto-flush.
    [[nodiscard]] inline expected<HashChainEntry> log(const InferenceRecord& record) {
        // Commercial-tier usage metering; a metering failure aborts the call.
        auto usage = commercial::record_usage_rows("InferenceLogWriter::log", 1);
        if (!usage) return usage.error();

        // Serialize the record for hashing
        auto data = record.serialize();

        // Use the record's timestamp, or current time if zero
        int64_t ts = record.timestamp_ns;
        if (ts == 0) {
            ts = now_ns();
        }

        // Append to hash chain
        auto entry = chain_.append(data.data(), data.size(), ts);

        // Store the record, chain entry, and serialized data for lineage
        pending_records_.push_back(record);
        pending_entries_.push_back(entry);
        pending_data_.push_back(std::move(data));

        // Auto-flush if we've reached the rotation threshold
        if (pending_records_.size() >= max_records_) {
            auto result = flush();
            if (!result) return result.error();
        }

        // NOTE(review): if the auto-flush above fails, total_records_ is not
        // incremented even though the record is already chained and buffered
        // — confirm this accounting is intended.
        ++total_records_;
        return entry;
    }
561
    /// Flush the pending batch to a new Parquet file.
    ///
    /// File name encodes the chain id and the first/last chain sequence
    /// numbers of the batch; audit-chain metadata is embedded in the file's
    /// key-value metadata. The row-value order below must match
    /// inference_log_schema() column-for-column.
    ///
    /// @return success, or the first writer error encountered. On success
    ///         the pending buffers are cleared; on error they are retained.
    [[nodiscard]] inline expected<void> flush() {
        if (pending_records_.empty()) {
            return expected<void>{};   // Nothing buffered: a no-op success.
        }

        // Build file path: inference_log_{chain_id}_{start_seq}_{end_seq}.parquet
        int64_t start_seq = pending_entries_.front().sequence_number;
        int64_t end_seq = pending_entries_.back().sequence_number;

        current_file_path_ = output_dir_ + "/inference_log_" + chain_id_ + "_"
            + std::to_string(start_seq) + "_"
            + std::to_string(end_seq) + ".parquet";

        // Prepare writer options with audit metadata
        WriterOptions opts;
        opts.created_by = "SignetStack signet-forge inference_log v1.0";

        // Embed audit chain metadata
        auto meta = current_metadata();
        auto meta_kvs = meta.to_key_values();
        for (auto& [k, v] : meta_kvs) {
            opts.file_metadata.push_back(thrift::KeyValue(std::move(k), std::move(v)));
        }

        // Open the Parquet file
        auto writer_result = ParquetWriter::open(current_file_path_, schema_, opts);
        if (!writer_result) return writer_result.error();
        auto& writer = *writer_result;

        // Write each record as a row
        size_t n = pending_records_.size();
        for (size_t i = 0; i < n; ++i) {
            const auto& rec = pending_records_[i];
            const auto& entry = pending_entries_[i];

            // Compute per-row lineage from serialized data
            const auto& row_data = pending_data_[i];
            auto lineage = lineage_tracker_.next(row_data.data(), row_data.size());

            std::vector<std::string> row;
            row.reserve(22);   // 15 record + 3 chain + 4 lineage columns

            // Record fields (schema columns 0-14), stringified for write_row.
            row.push_back(std::to_string(rec.timestamp_ns));
            row.push_back(rec.model_id);
            row.push_back(rec.model_version);
            row.push_back(std::to_string(static_cast<int32_t>(rec.inference_type)));
            row.push_back(detail::embedding_to_json(rec.input_embedding));
            row.push_back(rec.input_hash);
            row.push_back(rec.output_hash);
            row.push_back(double_to_string(static_cast<double>(rec.output_score)));
            row.push_back(std::to_string(rec.latency_ns));
            row.push_back(std::to_string(rec.batch_size));
            row.push_back(std::to_string(rec.input_tokens));
            row.push_back(std::to_string(rec.output_tokens));
            row.push_back(rec.user_id_hash);
            row.push_back(rec.session_id);
            row.push_back(rec.metadata_json);
            // Hash-chain columns (15-17).
            row.push_back(std::to_string(entry.sequence_number));
            row.push_back(hash_to_hex(entry.entry_hash));
            row.push_back(hash_to_hex(entry.prev_hash));
            // Lineage columns (18-21).
            row.push_back(std::to_string(lineage.row_id));
            row.push_back(std::to_string(lineage.row_version));
            row.push_back(lineage.row_origin_file);
            row.push_back(lineage.row_prev_hash);

            auto write_result = writer.write_row(row);
            if (!write_result) return write_result.error();
        }

        // Close the file (flushes internal buffers and writes footer)
        auto close_result = writer.close();
        if (!close_result) return close_result.error();

        // Clear pending buffers
        pending_records_.clear();
        pending_entries_.clear();
        pending_data_.clear();
        ++file_count_;

        return expected<void>{};
    }
648
650 [[nodiscard]] inline expected<void> close() {
651 if (!pending_records_.empty()) {
652 return flush();
653 }
654 return expected<void>{};
655 }
656
    /// Build the audit metadata describing the current (unflushed) batch.
    ///
    /// With pending entries: spans first..last pending sequence numbers and
    /// records the previous file's linking hash. With an empty batch but a
    /// non-empty chain: collapses to the chain's last entry so metadata is
    /// never stale. record_count always reflects the pending batch only.
    [[nodiscard]] inline AuditMetadata current_metadata() const {
        AuditMetadata meta;
        meta.chain_id = chain_id_;

        if (!pending_entries_.empty()) {
            meta.start_sequence = pending_entries_.front().sequence_number;
            meta.end_sequence = pending_entries_.back().sequence_number;
            meta.first_hash = hash_to_hex(pending_entries_.front().entry_hash);
            meta.last_hash = hash_to_hex(pending_entries_.back().entry_hash);
            // First pending entry's prev_hash links back to the previous file.
            meta.prev_file_hash = hash_to_hex(pending_entries_.front().prev_hash);
        } else if (!chain_.entries().empty()) {
            const auto& last = chain_.entries().back();
            meta.start_sequence = last.sequence_number;
            meta.end_sequence = last.sequence_number;
            meta.first_hash = hash_to_hex(last.entry_hash);
            meta.last_hash = hash_to_hex(last.entry_hash);
        }

        meta.record_count = static_cast<int64_t>(pending_entries_.size());
        meta.record_type = "inference";
        return meta;
    }
683
    /// Number of records buffered in memory, not yet flushed to a file.
    [[nodiscard]] inline size_t pending_records() const {
        return pending_records_.size();
    }

    /// Total records logged over this writer's lifetime (across all files).
    [[nodiscard]] inline int64_t total_records() const {
        return total_records_;
    }

    /// Path of the most recently targeted Parquet file (empty before the
    /// first flush()).
    [[nodiscard]] inline std::string current_file_path() const {
        return current_file_path_;
    }
698
private:
    std::string output_dir_;      // Destination directory (validated against '..').
    std::string chain_id_;        // Audit chain id, [a-zA-Z0-9_-] only.
    size_t max_records_;          // Auto-flush threshold for the pending batch.
    Schema schema_;               // Cached inference_log_schema().
    AuditChainWriter chain_;      // Builds the SHA-256 hash chain.
    std::vector<InferenceRecord> pending_records_;   // Unflushed records.
    std::vector<HashChainEntry> pending_entries_;    // Their chain entries.
    std::vector<std::vector<uint8_t>> pending_data_; // Serialized record bytes for lineage hashing
    RowLineageTracker lineage_tracker_;  // Per-row lineage (row_id/version/hash).
    std::string current_file_path_;      // Last file path produced by flush().
    int64_t total_records_{0};           // Lifetime record count.
    int64_t file_count_{0};              // Number of files written so far.
712
714 static inline std::string double_to_string(double v) {
715 char buf[32];
716 std::snprintf(buf, sizeof(buf), "%.17g", v);
717 return buf;
718 }
719};
720
729public:
    /// Open an inference log Parquet file and pre-load all column data.
    ///
    /// @param path  Path to a file produced by InferenceLogWriter.
    /// @return A ready reader, or the first licensing / open / load error.
    [[nodiscard]] static inline expected<InferenceLogReader> open(const std::string& path) {
        // License gate before touching the file.
        auto license = commercial::require_feature("InferenceLogReader");
        if (!license) return license.error();

        auto reader_result = ParquetReader::open(path);
        if (!reader_result) return reader_result.error();

        // NOTE(review): the declaration of `ilr` (Doxygen line 741,
        // presumably `InferenceLogReader ilr;`) is elided from this
        // rendering — confirm against the repository.
        ilr.reader_ = std::make_unique<ParquetReader>(std::move(*reader_result));
        ilr.path_ = path;

        // Pre-read all column data from all row groups
        auto load_result = ilr.load_columns();
        if (!load_result) return load_result.error();

        return ilr;
    }
751
760
    /// Materialize every row of the file as an InferenceRecord.
    ///
    /// Chain/lineage columns are not copied into the records; the embedding
    /// JSON column is parsed back into floats. Fails if the loaded columns
    /// have inconsistent row counts.
    [[nodiscard]] inline expected<std::vector<InferenceRecord>> read_all() const {
        auto validation = validate_loaded_columns();
        if (!validation) return validation.error();

        size_t n = col_timestamp_ns_.size();
        std::vector<InferenceRecord> records;
        records.reserve(n);

        for (size_t i = 0; i < n; ++i) {
            InferenceRecord rec;
            rec.timestamp_ns = col_timestamp_ns_[i];
            rec.model_id = col_model_id_[i];
            rec.model_version = col_model_version_[i];
            rec.inference_type = static_cast<InferenceType>(col_inference_type_[i]);
            // Range-check the stored enum value (valid: 0..6 or CUSTOM=255).
            // NOTE(review): the statement guarded by this check (Doxygen line
            // 779) is elided from this rendering — presumably an error
            // return; confirm against the repository.
            if (col_inference_type_[i] < 0 || (col_inference_type_[i] > 6 && col_inference_type_[i] != 255))
            rec.input_embedding = detail::json_to_embedding(col_input_embedding_[i]);
            rec.input_hash = col_input_hash_[i];
            rec.output_hash = col_output_hash_[i];
            rec.output_score = static_cast<float>(col_output_score_[i]);
            rec.latency_ns = col_latency_ns_[i];
            rec.batch_size = col_batch_size_[i];
            rec.input_tokens = col_input_tokens_[i];
            rec.output_tokens = col_output_tokens_[i];
            rec.user_id_hash = col_user_id_hash_[i];
            rec.session_id = col_session_id_[i];
            rec.metadata_json = col_metadata_json_[i];
            records.push_back(std::move(rec));
        }

        return records;
    }
796
800 [[nodiscard]] inline expected<AuditMetadata> audit_metadata() const {
801 const auto& kvs = reader_->key_value_metadata();
802 AuditMetadata meta;
803
804 for (const auto& kv : kvs) {
805 if (!kv.value.has_value()) continue;
806 const auto& val = *kv.value;
807
808 if (kv.key == "signetstack.audit.chain_id") meta.chain_id = val;
809 else if (kv.key == "signetstack.audit.first_seq") { try { meta.start_sequence = std::stoll(val); } catch (...) {} }
810 else if (kv.key == "signetstack.audit.last_seq") { try { meta.end_sequence = std::stoll(val); } catch (...) {} }
811 else if (kv.key == "signetstack.audit.first_hash") meta.first_hash = val;
812 else if (kv.key == "signetstack.audit.last_hash") meta.last_hash = val;
813 else if (kv.key == "signetstack.audit.prev_file_hash") meta.prev_file_hash = val;
814 else if (kv.key == "signetstack.audit.record_count") { try { meta.record_count = std::stoll(val); } catch (...) {} }
815 else if (kv.key == "signetstack.audit.record_type") meta.record_type = val;
816 }
817
818 return meta;
819 }
820
    // NOTE(review): this is the body of verify_chain() — its signature
    // (returning AuditChainVerifier::VerificationResult per the class
    // outline) and the declarations of the two local `bad` result objects
    // (Doxygen lines 829-832 and 853) are elided from this rendering;
    // confirm against the repository.
    //
    // Verification proceeds in three steps: (1) column consistency check,
    // (2) rebuild each HashChainEntry from the stored columns, re-hashing
    // the reconstructed record with SHA-256, (3) delegate continuity
    // checking to AuditChainVerifier.
    auto validation = validate_loaded_columns();
    if (!validation) {
        bad.valid = false;
        bad.entries_checked = 0;
        bad.first_bad_index = 0;
        bad.error_message = validation.error().message;
        return bad;
    }

    // Reconstruct chain entries from stored columns
    std::vector<HashChainEntry> entries;
    size_t n = col_chain_seq_.size();
    entries.reserve(n);

    for (size_t i = 0; i < n; ++i) {
        HashChainEntry entry;
        entry.sequence_number = col_chain_seq_[i];
        entry.timestamp_ns = col_timestamp_ns_[i];
        auto eh = hex_to_hash(col_chain_hash_[i]);
        auto ph = hex_to_hash(col_prev_hash_[i]);
        if (!eh || !ph) {
            // Hash deserialization failure — chain is invalid
            bad.valid = false;
            bad.entries_checked = static_cast<int64_t>(i);
            bad.first_bad_index = static_cast<int64_t>(i);
            bad.error_message = !eh ? "entry_hash deserialization failed at record "
                                          + std::to_string(i)
                                    : "prev_hash deserialization failed at record "
                                          + std::to_string(i);
            return bad;
        }
        entry.entry_hash = *eh;
        entry.prev_hash = *ph;

        // Re-compute data_hash from the record
        InferenceRecord rec;
        rec.timestamp_ns = col_timestamp_ns_[i];
        rec.model_id = col_model_id_[i];
        rec.model_version = col_model_version_[i];
        rec.inference_type = static_cast<InferenceType>(col_inference_type_[i]);
        // NOTE(review): the statement guarded by this range check (Doxygen
        // line 873) is elided from this rendering — confirm.
        if (col_inference_type_[i] < 0 || (col_inference_type_[i] > 6 && col_inference_type_[i] != 255))
        rec.input_embedding = detail::json_to_embedding(col_input_embedding_[i]);
        rec.input_hash = col_input_hash_[i];
        rec.output_hash = col_output_hash_[i];
        rec.output_score = static_cast<float>(col_output_score_[i]);
        rec.latency_ns = col_latency_ns_[i];
        rec.batch_size = col_batch_size_[i];
        rec.input_tokens = col_input_tokens_[i];
        rec.output_tokens = col_output_tokens_[i];
        rec.user_id_hash = col_user_id_hash_[i];
        rec.session_id = col_session_id_[i];
        rec.metadata_json = col_metadata_json_[i];

        // Re-serialize and hash so tampering with any stored column is
        // detected against the chained hashes.
        auto data = rec.serialize();
        entry.data_hash = crypto::detail::sha256::sha256(data.data(), data.size());

        entries.push_back(std::move(entry));
    }

    // Verify using AuditChainVerifier
    AuditChainVerifier verifier;
    return verifier.verify(entries);
}
896
    /// Schema of the underlying Parquet file as parsed by ParquetReader.
    [[nodiscard]] inline const Schema& schema() const {
        return reader_->schema();
    }

    /// Number of rows (= inference records) reported by the file footer.
    [[nodiscard]] inline int64_t num_records() const {
        return reader_->num_rows();
    }
906
private:
    std::unique_ptr<ParquetReader> reader_;  // Owning handle on the open file.
    std::string path_;                       // Path used in error messages.

    // Column data (loaded once on open). Indices across all col_* vectors
    // line up row-for-row; validate_loaded_columns() enforces equal sizes.
    std::vector<int64_t> col_timestamp_ns_;
    std::vector<std::string> col_model_id_;
    std::vector<std::string> col_model_version_;
    std::vector<int32_t> col_inference_type_;
    std::vector<std::string> col_input_embedding_;   // JSON-encoded floats.
    std::vector<std::string> col_input_hash_;
    std::vector<std::string> col_output_hash_;
    std::vector<double> col_output_score_;           // Stored widened to double.
    std::vector<int64_t> col_latency_ns_;
    std::vector<int32_t> col_batch_size_;
    std::vector<int32_t> col_input_tokens_;
    std::vector<int32_t> col_output_tokens_;
    std::vector<std::string> col_user_id_hash_;
    std::vector<std::string> col_session_id_;
    std::vector<std::string> col_metadata_json_;
    std::vector<int64_t> col_chain_seq_;             // Hash-chain columns used
    std::vector<std::string> col_chain_hash_;        // by verify_chain().
    std::vector<std::string> col_prev_hash_;
930
    /// Check that every loaded column has the same row count as
    /// col_timestamp_ns_ (the reference column). A mismatch indicates a
    /// corrupt or truncated file and is reported with the column name.
    [[nodiscard]] inline expected<void> validate_loaded_columns() const {
        const size_t expected_rows = col_timestamp_ns_.size();
        // Shared error builder so every mismatch is phrased identically.
        // NOTE(review): the ErrorCode argument of this Error construction
        // (Doxygen line 936) is elided from this rendering — confirm against
        // the repository.
        const auto mismatch = [&](const char* column_name, size_t actual_rows)
            -> expected<void> {
            return Error{
                "InferenceLogReader: column '" + std::string(column_name) +
                "' row count mismatch in '" + path_ + "' (expected " +
                std::to_string(expected_rows) + ", got " +
                std::to_string(actual_rows) + ")"};
        };

        if (col_model_id_.size() != expected_rows)
            return mismatch("model_id", col_model_id_.size());
        if (col_model_version_.size() != expected_rows)
            return mismatch("model_version", col_model_version_.size());
        if (col_inference_type_.size() != expected_rows)
            return mismatch("inference_type", col_inference_type_.size());
        if (col_input_embedding_.size() != expected_rows)
            return mismatch("input_embedding", col_input_embedding_.size());
        if (col_input_hash_.size() != expected_rows)
            return mismatch("input_hash", col_input_hash_.size());
        if (col_output_hash_.size() != expected_rows)
            return mismatch("output_hash", col_output_hash_.size());
        if (col_output_score_.size() != expected_rows)
            return mismatch("output_score", col_output_score_.size());
        if (col_latency_ns_.size() != expected_rows)
            return mismatch("latency_ns", col_latency_ns_.size());
        if (col_batch_size_.size() != expected_rows)
            return mismatch("batch_size", col_batch_size_.size());
        if (col_input_tokens_.size() != expected_rows)
            return mismatch("input_tokens", col_input_tokens_.size());
        if (col_output_tokens_.size() != expected_rows)
            return mismatch("output_tokens", col_output_tokens_.size());
        if (col_user_id_hash_.size() != expected_rows)
            return mismatch("user_id_hash", col_user_id_hash_.size());
        if (col_session_id_.size() != expected_rows)
            return mismatch("session_id", col_session_id_.size());
        if (col_metadata_json_.size() != expected_rows)
            return mismatch("metadata_json", col_metadata_json_.size());
        if (col_chain_seq_.size() != expected_rows)
            return mismatch("chain_seq", col_chain_seq_.size());
        if (col_chain_hash_.size() != expected_rows)
            return mismatch("chain_hash", col_chain_hash_.size());
        if (col_prev_hash_.size() != expected_rows)
            return mismatch("prev_hash", col_prev_hash_.size());

        return expected<void>{};
    }
980
982 [[nodiscard]] inline expected<void> load_columns() {
983 int64_t num_rgs = reader_->num_row_groups();
984
985 for (int64_t rg = 0; rg < num_rgs; ++rg) {
986 size_t rg_idx = static_cast<size_t>(rg);
987
988 // Column 0: timestamp_ns (INT64)
989 auto r0 = reader_->read_column<int64_t>(rg_idx, 0);
990 if (!r0) return r0.error();
991 col_timestamp_ns_.insert(col_timestamp_ns_.end(), r0->begin(), r0->end());
992
993 // Column 1: model_id (STRING)
994 auto r1 = reader_->read_column<std::string>(rg_idx, 1);
995 if (!r1) return r1.error();
996 col_model_id_.insert(col_model_id_.end(),
997 std::make_move_iterator(r1->begin()), std::make_move_iterator(r1->end()));
998
999 // Column 2: model_version (STRING)
1000 auto r2 = reader_->read_column<std::string>(rg_idx, 2);
1001 if (!r2) return r2.error();
1002 col_model_version_.insert(col_model_version_.end(),
1003 std::make_move_iterator(r2->begin()), std::make_move_iterator(r2->end()));
1004
1005 // Column 3: inference_type (INT32)
1006 auto r3 = reader_->read_column<int32_t>(rg_idx, 3);
1007 if (!r3) return r3.error();
1008 col_inference_type_.insert(col_inference_type_.end(), r3->begin(), r3->end());
1009
1010 // Column 4: input_embedding (STRING — JSON array)
1011 auto r4 = reader_->read_column<std::string>(rg_idx, 4);
1012 if (!r4) return r4.error();
1013 col_input_embedding_.insert(col_input_embedding_.end(),
1014 std::make_move_iterator(r4->begin()), std::make_move_iterator(r4->end()));
1015
1016 // Column 5: input_hash (STRING)
1017 auto r5 = reader_->read_column<std::string>(rg_idx, 5);
1018 if (!r5) return r5.error();
1019 col_input_hash_.insert(col_input_hash_.end(),
1020 std::make_move_iterator(r5->begin()), std::make_move_iterator(r5->end()));
1021
1022 // Column 6: output_hash (STRING)
1023 auto r6 = reader_->read_column<std::string>(rg_idx, 6);
1024 if (!r6) return r6.error();
1025 col_output_hash_.insert(col_output_hash_.end(),
1026 std::make_move_iterator(r6->begin()), std::make_move_iterator(r6->end()));
1027
1028 // Column 7: output_score (DOUBLE)
1029 auto r7 = reader_->read_column<double>(rg_idx, 7);
1030 if (!r7) return r7.error();
1031 col_output_score_.insert(col_output_score_.end(), r7->begin(), r7->end());
1032
1033 // Column 8: latency_ns (INT64)
1034 auto r8 = reader_->read_column<int64_t>(rg_idx, 8);
1035 if (!r8) return r8.error();
1036 col_latency_ns_.insert(col_latency_ns_.end(), r8->begin(), r8->end());
1037
1038 // Column 9: batch_size (INT32)
1039 auto r9 = reader_->read_column<int32_t>(rg_idx, 9);
1040 if (!r9) return r9.error();
1041 col_batch_size_.insert(col_batch_size_.end(), r9->begin(), r9->end());
1042
1043 // Column 10: input_tokens (INT32)
1044 auto r10 = reader_->read_column<int32_t>(rg_idx, 10);
1045 if (!r10) return r10.error();
1046 col_input_tokens_.insert(col_input_tokens_.end(), r10->begin(), r10->end());
1047
1048 // Column 11: output_tokens (INT32)
1049 auto r11 = reader_->read_column<int32_t>(rg_idx, 11);
1050 if (!r11) return r11.error();
1051 col_output_tokens_.insert(col_output_tokens_.end(), r11->begin(), r11->end());
1052
1053 // Column 12: user_id_hash (STRING)
1054 auto r12 = reader_->read_column<std::string>(rg_idx, 12);
1055 if (!r12) return r12.error();
1056 col_user_id_hash_.insert(col_user_id_hash_.end(),
1057 std::make_move_iterator(r12->begin()), std::make_move_iterator(r12->end()));
1058
1059 // Column 13: session_id (STRING)
1060 auto r13 = reader_->read_column<std::string>(rg_idx, 13);
1061 if (!r13) return r13.error();
1062 col_session_id_.insert(col_session_id_.end(),
1063 std::make_move_iterator(r13->begin()), std::make_move_iterator(r13->end()));
1064
1065 // Column 14: metadata_json (STRING)
1066 auto r14 = reader_->read_column<std::string>(rg_idx, 14);
1067 if (!r14) return r14.error();
1068 col_metadata_json_.insert(col_metadata_json_.end(),
1069 std::make_move_iterator(r14->begin()), std::make_move_iterator(r14->end()));
1070
1071 // Column 15: chain_seq (INT64)
1072 auto r15 = reader_->read_column<int64_t>(rg_idx, 15);
1073 if (!r15) return r15.error();
1074 col_chain_seq_.insert(col_chain_seq_.end(), r15->begin(), r15->end());
1075
1076 // Column 16: chain_hash (STRING)
1077 auto r16 = reader_->read_column<std::string>(rg_idx, 16);
1078 if (!r16) return r16.error();
1079 col_chain_hash_.insert(col_chain_hash_.end(),
1080 std::make_move_iterator(r16->begin()), std::make_move_iterator(r16->end()));
1081
1082 // Column 17: prev_hash (STRING)
1083 auto r17 = reader_->read_column<std::string>(rg_idx, 17);
1084 if (!r17) return r17.error();
1085 col_prev_hash_.insert(col_prev_hash_.end(),
1086 std::make_move_iterator(r17->begin()), std::make_move_iterator(r17->end()));
1087 }
1088
1089 return validate_loaded_columns();
1090 }
1091};
1092
1093} // namespace signet::forge
Verifies hash chain integrity.
static VerificationResult verify(const uint8_t *chain_data, size_t chain_size)
Verify a chain from serialized bytes.
Builds SHA-256 hash chains during Parquet writes.
const std::vector< HashChainEntry > & entries() const
Return a const reference to the internal entry list.
HashChainEntry append(const uint8_t *record_data, size_t record_size, int64_t timestamp_ns)
Append a record to the chain with an explicit timestamp.
Reads ML inference log Parquet files and verifies hash chain integrity.
InferenceLogReader(const InferenceLogReader &)=delete
InferenceLogReader(InferenceLogReader &&)=default
const Schema & schema() const
Get the schema of the inference log file.
int64_t num_records() const
Number of records in the file.
InferenceLogReader & operator=(const InferenceLogReader &)=delete
expected< std::vector< InferenceRecord > > read_all() const
Get all inference records from the file.
static expected< InferenceLogReader > open(const std::string &path)
Open an inference log Parquet file and pre-load all column data.
expected< AuditMetadata > audit_metadata() const
Get the audit chain metadata from the Parquet file's key-value metadata.
InferenceLogReader & operator=(InferenceLogReader &&)=default
AuditChainVerifier::VerificationResult verify_chain() const
Verify the hash chain integrity by re-hashing each record and checking chain continuity.
Writes ML inference records to Parquet files with cryptographic hash chaining for tamper-evident auditing.
expected< void > close()
Close the writer (flushes remaining records).
size_t pending_records() const
Get the number of records in the current (unflushed) batch.
InferenceLogWriter(const std::string &output_dir, const std::string &chain_id="", size_t max_records=100000)
Create an inference log writer.
expected< void > flush()
Flush current records to a Parquet file.
expected< HashChainEntry > log(const InferenceRecord &record)
Log an inference event.
AuditMetadata current_metadata() const
Get the chain metadata for the current batch.
std::string current_file_path() const
Get the file path of the current (or last written) output file.
int64_t total_records() const
Get the total number of records written across all files.
static expected< ParquetReader > open(const std::filesystem::path &path)
Open and parse a Parquet file, returning a ready-to-query reader.
Definition reader.hpp:189
static expected< ParquetWriter > open(const std::filesystem::path &path, const Schema &schema, const Options &options=Options{})
Open a new Parquet file for writing.
Definition writer.hpp:303
Per-row lineage tracking inspired by Iceberg V3-style data governance.
RowLineage next(const uint8_t *row_data, size_t row_size)
Generate lineage for the next row.
SchemaBuilder & column(std::string col_name, LogicalType logical_type=LogicalType::NONE)
Add a typed column, deducing PhysicalType from T.
Definition schema.hpp:107
Schema build()
Build the final Schema, consuming the builder.
Definition schema.hpp:303
Immutable schema description for a Parquet file.
Definition schema.hpp:192
static SchemaBuilder builder(std::string name)
Create a SchemaBuilder for fluent column construction.
Definition schema.hpp:228
const Error & error() const
Access the error payload (valid for both success and failure; check ok() on the returned Error).
Definition error.hpp:261
A lightweight result type that holds either a success value of type T or an Error.
Definition error.hpp:145
std::array< uint8_t, 32 > sha256(const uint8_t *data, size_t size)
Compute SHA-256 hash of arbitrary-length input.
Definition sha256.hpp:165
int64_t now_ns()
Return the current time as nanoseconds since the Unix epoch (UTC).
expected< std::array< uint8_t, 32 > > hex_to_hash(const std::string &hex)
Convert a 64-character lowercase hex string back to a 32-byte hash.
InferenceType
Classification of the ML inference operation.
@ REGRESSION
Continuous value prediction.
@ CUSTOM
Application-specific inference type.
@ CLASSIFICATION
Binary or multi-class classification.
@ RECOMMENDATION
Recommendation system inference.
@ EMBEDDING
Vector embedding computation.
@ GENERATION
LLM text generation.
@ ANOMALY
Anomaly/outlier detection.
@ RANKING
Ranking/scoring of candidates.
@ TIMESTAMP_NS
Timestamp — INT64, nanoseconds since Unix epoch.
Schema inference_log_schema()
Build the Parquet schema for ML inference log files.
std::string hash_to_hex(const std::array< uint8_t, 32 > &hash)
Convert a 32-byte SHA-256 hash to a lowercase hexadecimal string (64 chars).
@ CORRUPT_PAGE
A data page failed integrity checks (bad CRC, truncated, or exceeds size limits).
@ CORRUPT_DATA
Decoded data is corrupt or inconsistent (e.g. out-of-range dictionary index).
std::string generate_chain_id()
Generate a simple chain identifier based on the current timestamp.
Per-row lineage tracking (Iceberg V3-style) with monotonic row IDs, mutation versioning, and provenance hashing.
Schema definition types: Column<T>, SchemaBuilder, and Schema.
bool valid
True if the entire chain passed all integrity checks.
int64_t first_bad_index
Index of the first entry that failed verification, or -1 if all entries are valid.
int64_t entries_checked
Number of entries that were successfully verified before a failure was detected (or the total count if the entire chain is valid).
std::string error_message
Human-readable description of the verification outcome.
Chain summary stored in Parquet key-value metadata.
int64_t end_sequence
Sequence number of the last entry in this file's chain segment.
int64_t record_count
Number of audit records in this segment.
std::string last_hash
Hex string of the last entry's entry_hash in this segment.
std::string chain_id
Unique identifier for this chain (generated by generate_chain_id()).
std::string record_type
Record type: "decision", "inference", etc.
std::string first_hash
Hex string of the first entry's entry_hash in this segment.
std::string prev_file_hash
Hex string of the first entry's prev_hash (links to the prior file).
int64_t start_sequence
Sequence number of the first entry in this file's chain segment.
Lightweight error value carrying an ErrorCode and a human-readable message.
Definition error.hpp:101
A single link in the cryptographic hash chain.
int64_t sequence_number
0-indexed position in the chain, monotonically increasing.
std::array< uint8_t, 32 > entry_hash
SHA-256 commitment over (sequence_number, timestamp_ns, prev_hash, data_hash).
int64_t timestamp_ns
Nanoseconds since Unix epoch when this entry was created.
std::array< uint8_t, 32 > prev_hash
SHA-256 hash of the previous entry (all zeros for the first entry, or a user-supplied continuation hash).
std::array< uint8_t, 32 > data_hash
SHA-256 hash of the record/row data that this entry covers.
A single ML inference event with full operational metadata.
std::string training_dataset_id
Training data identifier.
std::string model_retraining_schedule
Cron or description of retraining schedule (EU AI Act Art.13)
std::string model_id
Model identifier (e.g., "gpt-4", "bert-base")
std::string output_hash
SHA-256 hash of raw output.
int64_t timestamp_ns
Inference timestamp (nanoseconds since epoch)
int64_t model_training_end_ns
Timestamp when model training completed (EU AI Act Art.12)
std::vector< uint8_t > serialize() const
Serialize the record to a deterministic byte sequence.
std::vector< float > input_embedding
Input embedding (optional, may be empty)
int64_t training_dataset_size
Number of samples in training dataset.
int64_t model_training_data_cutoff_ns
Latest data timestamp used in training.
static expected< InferenceRecord > deserialize(const uint8_t *data, size_t size)
Reconstruct an InferenceRecord from its serialized byte representation.
int64_t latency_ns
Inference latency in nanoseconds.
int32_t output_tokens
Output token count (LLM, 0 if N/A)
std::string session_id
Session identifier.
float output_score
Primary output score/probability.
std::string user_id_hash
Hashed user ID (for privacy)
InferenceType inference_type
Type of inference.
std::string metadata_json
Additional JSON metadata.
std::string model_version
Model version hash or checkpoint ID.
std::string training_data_characteristics
Description of training data properties.
int32_t input_tokens
Input token count (LLM, 0 if N/A)
std::string input_hash
SHA-256 hash of raw input (for privacy)
Configuration options for ParquetWriter.
Definition writer.hpp:188
std::vector< thrift::KeyValue > file_metadata
Custom key-value metadata pairs embedded in the Parquet footer.
Definition writer.hpp:198
std::string created_by
Value written into the Parquet footer's "created_by" field.
Definition writer.hpp:195
Parquet KeyValue metadata entry (parquet.thrift field IDs 1-2).
Definition types.hpp:468
Parquet format enumerations, type traits, and statistics structs.