6#if !defined(SIGNET_ENABLE_COMMERCIAL) || !SIGNET_ENABLE_COMMERCIAL
7#error "signet/ai/inference_log.hpp requires SIGNET_ENABLE_COMMERCIAL=ON (AGPL-3.0 commercial tier). See LICENSE_COMMERCIAL."
97 [[nodiscard]]
inline std::vector<uint8_t>
serialize()
const {
98 std::vector<uint8_t> buf;
116 append_float(buf, f);
129 append_le64(buf,
static_cast<uint64_t
>(
latency_ns));
132 append_le32(buf,
static_cast<uint32_t
>(
batch_size));
168 const uint8_t* data,
size_t size) {
175 if (!read_string(data, size, offset, rec.
model_id)) {
183 if (!read_le32(data, size, offset, it_val)) {
187 if (it_val < 0 || (it_val > 6 && it_val != 255))
190 uint32_t emb_count = 0;
191 if (!read_le32_u(data, size, offset, emb_count)) {
194 if (offset > size || emb_count > (size - offset) /
sizeof(
float)) {
198 for (uint32_t i = 0; i < emb_count; ++i) {
204 if (!read_string(data, size, offset, rec.
input_hash)) {
207 if (!read_string(data, size, offset, rec.
output_hash)) {
210 if (!read_float(data, size, offset, rec.
output_score)) {
213 if (!read_le64(data, size, offset, rec.
latency_ns)) {
216 if (!read_le32(data, size, offset, rec.
batch_size)) {
225 if (!read_string(data, size, offset, rec.
user_id_hash)) {
228 if (!read_string(data, size, offset, rec.
session_id)) {
268 static inline void append_le32(std::vector<uint8_t>& buf, uint32_t v) {
269 buf.push_back(
static_cast<uint8_t
>(v));
270 buf.push_back(
static_cast<uint8_t
>(v >> 8));
271 buf.push_back(
static_cast<uint8_t
>(v >> 16));
272 buf.push_back(
static_cast<uint8_t
>(v >> 24));
275 static inline void append_le64(std::vector<uint8_t>& buf, uint64_t v) {
276 for (
int i = 0; i < 8; ++i) {
277 buf.push_back(
static_cast<uint8_t
>(v >> (i * 8)));
281 static inline void append_float(std::vector<uint8_t>& buf,
float v) {
283 std::memcpy(&bits, &v, 4);
284 append_le32(buf, bits);
287 static inline void append_string(std::vector<uint8_t>& buf,
const std::string& s) {
289 const size_t clamped = (std::min)(s.size(),
static_cast<size_t>(UINT32_MAX));
290 append_le32(buf,
static_cast<uint32_t
>(clamped));
291 buf.insert(buf.end(), s.begin(), s.begin() +
static_cast<ptrdiff_t
>(clamped));
296 static inline bool read_le64(
const uint8_t* data,
size_t size,
size_t& offset, int64_t& out) {
297 if (offset + 8 > size)
return false;
299 for (
int i = 0; i < 8; ++i) {
300 v |=
static_cast<uint64_t
>(data[offset + i]) << (i * 8);
302 out =
static_cast<int64_t
>(v);
307 static inline bool read_le32(
const uint8_t* data,
size_t size,
size_t& offset, int32_t& out) {
308 if (offset + 4 > size)
return false;
310 for (
int i = 0; i < 4; ++i) {
311 v |=
static_cast<uint32_t
>(data[offset + i]) << (i * 8);
313 out =
static_cast<int32_t
>(v);
318 static inline bool read_le32_u(
const uint8_t* data,
size_t size,
size_t& offset, uint32_t& out) {
319 if (offset + 4 > size)
return false;
321 for (
int i = 0; i < 4; ++i) {
322 out |=
static_cast<uint32_t
>(data[offset + i]) << (i * 8);
328 static inline bool read_float(
const uint8_t* data,
size_t size,
size_t& offset,
float& out) {
329 if (offset + 4 > size)
return false;
331 for (
int i = 0; i < 4; ++i) {
332 bits |=
static_cast<uint32_t
>(data[offset + i]) << (i * 8);
334 std::memcpy(&out, &bits, 4);
339 static constexpr uint32_t MAX_STRING_LEN = 16u * 1024u * 1024u;
341 static inline bool read_string(
const uint8_t* data,
size_t size,
size_t& offset, std::string& out) {
343 if (!read_le32_u(data, size, offset, len))
return false;
344 if (len > MAX_STRING_LEN)
return false;
345 if (offset + len > size)
return false;
346 out.assign(
reinterpret_cast<const char*
>(data + offset), len);
361inline std::string embedding_to_json(
const std::vector<float>& embedding) {
362 if (embedding.empty())
return "[]";
364 std::string result =
"[";
365 for (
size_t i = 0; i < embedding.size(); ++i) {
366 if (i > 0) result +=
',';
368 std::snprintf(buf,
sizeof(buf),
"%.8g",
static_cast<double>(embedding[i]));
381inline std::vector<float> json_to_embedding(
const std::string& json) {
383 static constexpr size_t MAX_JSON_ARRAY_ELEMENTS = 1'000'000;
384 std::vector<float> result;
385 if (json.size() < 2 || json.front() !=
'[' || json.back() !=
']') {
390 size_t end = json.size() - 1;
393 if (result.size() >= MAX_JSON_ARRAY_ELEMENTS)
break;
394 while (pos < end && (json[pos] ==
' ' || json[pos] ==
'\t')) ++pos;
395 if (pos >= end)
break;
397 size_t num_start = pos;
398 while (pos < end && json[pos] !=
',') ++pos;
400 std::string num_str = json.substr(num_start, pos - num_start);
401 while (!num_str.empty() && (num_str.back() ==
' ' || num_str.back() ==
'\t')) {
404 if (!num_str.empty()) {
406 result.push_back(std::stof(num_str));
412 if (pos < end && json[pos] ==
',') ++pos;
444 .column<std::string>(
"model_id")
445 .
column<std::string>(
"model_version")
446 .column<int32_t>(
"inference_type")
447 .
column<std::string>(
"input_embedding")
448 .column<std::string>(
"input_hash")
449 .
column<std::string>(
"output_hash")
450 .column<double>(
"output_score")
451 .
column<int64_t>(
"latency_ns")
452 .column<int32_t>(
"batch_size")
453 .
column<int32_t>(
"input_tokens")
454 .column<int32_t>(
"output_tokens")
455 .
column<std::string>(
"user_id_hash")
456 .column<std::string>(
"session_id")
457 .
column<std::string>(
"metadata_json")
458 .column<int64_t>(
"chain_seq")
459 .
column<std::string>(
"chain_hash")
460 .column<std::string>(
"prev_hash")
461 .
column<int64_t>(
"row_id")
462 .column<int32_t>(
"row_version")
463 .
column<std::string>(
"row_origin_file")
464 .column<std::string>(
"row_prev_hash")
493 const std::string& chain_id =
"",
494 size_t max_records = 100000)
495 : output_dir_(output_dir)
497 , max_records_(max_records)
499 , lineage_tracker_(chain_id.empty() ? chain_id_ : chain_id, 1)
501 auto license = commercial::require_feature(
"InferenceLogWriter");
503 throw std::runtime_error(license.error().message);
507 if (output_dir_.empty()) {
508 throw std::invalid_argument(
"InferenceLogWriter: output_dir must not be empty");
510 for (
size_t s = 0, e; s <= output_dir_.size(); s = e + 1) {
511 e = output_dir_.find_first_of(
"/\\", s);
512 if (e == std::string::npos) e = output_dir_.size();
513 if (output_dir_.substr(s, e - s) ==
"..") {
514 throw std::invalid_argument(
515 "InferenceLogWriter: output_dir must not contain '..' path traversal");
518 for (
char c : chain_id_) {
519 if (!std::isalnum(
static_cast<unsigned char>(c)) && c !=
'_' && c !=
'-') {
520 throw std::invalid_argument(
521 "InferenceLogWriter: chain_id must only contain [a-zA-Z0-9_-]");
532 auto usage = commercial::record_usage_rows(
"InferenceLogWriter::log", 1);
533 if (!usage)
return usage.error();
545 auto entry = chain_.
append(data.data(), data.size(), ts);
548 pending_records_.push_back(record);
549 pending_entries_.push_back(entry);
550 pending_data_.push_back(std::move(data));
553 if (pending_records_.size() >= max_records_) {
554 auto result =
flush();
555 if (!result)
return result.error();
568 if (pending_records_.empty()) {
573 int64_t start_seq = pending_entries_.front().sequence_number;
574 int64_t end_seq = pending_entries_.back().sequence_number;
576 current_file_path_ = output_dir_ +
"/inference_log_" + chain_id_ +
"_"
577 + std::to_string(start_seq) +
"_"
578 + std::to_string(end_seq) +
".parquet";
582 opts.
created_by =
"SignetStack signet-forge inference_log v1.0";
586 auto meta_kvs = meta.to_key_values();
587 for (
auto& [k, v] : meta_kvs) {
593 if (!writer_result)
return writer_result.error();
594 auto& writer = *writer_result;
597 size_t n = pending_records_.size();
598 for (
size_t i = 0; i < n; ++i) {
599 const auto& rec = pending_records_[i];
600 const auto& entry = pending_entries_[i];
603 const auto& row_data = pending_data_[i];
604 auto lineage = lineage_tracker_.
next(row_data.data(), row_data.size());
606 std::vector<std::string> row;
609 row.push_back(std::to_string(rec.timestamp_ns));
610 row.push_back(rec.model_id);
611 row.push_back(rec.model_version);
612 row.push_back(std::to_string(
static_cast<int32_t
>(rec.inference_type)));
613 row.push_back(detail::embedding_to_json(rec.input_embedding));
614 row.push_back(rec.input_hash);
615 row.push_back(rec.output_hash);
616 row.push_back(double_to_string(
static_cast<double>(rec.output_score)));
617 row.push_back(std::to_string(rec.latency_ns));
618 row.push_back(std::to_string(rec.batch_size));
619 row.push_back(std::to_string(rec.input_tokens));
620 row.push_back(std::to_string(rec.output_tokens));
621 row.push_back(rec.user_id_hash);
622 row.push_back(rec.session_id);
623 row.push_back(rec.metadata_json);
624 row.push_back(std::to_string(entry.sequence_number));
627 row.push_back(std::to_string(lineage.row_id));
628 row.push_back(std::to_string(lineage.row_version));
629 row.push_back(lineage.row_origin_file);
630 row.push_back(lineage.row_prev_hash);
632 auto write_result = writer.write_row(row);
633 if (!write_result)
return write_result.error();
637 auto close_result = writer.close();
638 if (!close_result)
return close_result.error();
641 pending_records_.clear();
642 pending_entries_.clear();
643 pending_data_.clear();
651 if (!pending_records_.empty()) {
665 if (!pending_entries_.empty()) {
667 meta.
end_sequence = pending_entries_.back().sequence_number;
671 }
else if (!chain_.
entries().empty()) {
672 const auto& last = chain_.
entries().back();
679 meta.
record_count =
static_cast<int64_t
>(pending_entries_.size());
686 return pending_records_.size();
691 return total_records_;
696 return current_file_path_;
700 std::string output_dir_;
701 std::string chain_id_;
705 std::vector<InferenceRecord> pending_records_;
706 std::vector<HashChainEntry> pending_entries_;
707 std::vector<std::vector<uint8_t>> pending_data_;
709 std::string current_file_path_;
710 int64_t total_records_{0};
711 int64_t file_count_{0};
714 static inline std::string double_to_string(
double v) {
716 std::snprintf(buf,
sizeof(buf),
"%.17g", v);
735 auto license = commercial::require_feature(
"InferenceLogReader");
736 if (!license)
return license.error();
739 if (!reader_result)
return reader_result.error();
742 ilr.reader_ = std::make_unique<ParquetReader>(std::move(*reader_result));
746 auto load_result = ilr.load_columns();
747 if (!load_result)
return load_result.
error();
765 auto validation = validate_loaded_columns();
766 if (!validation)
return validation.error();
768 size_t n = col_timestamp_ns_.size();
769 std::vector<InferenceRecord> records;
772 for (
size_t i = 0; i < n; ++i) {
778 if (col_inference_type_[i] < 0 || (col_inference_type_[i] > 6 && col_inference_type_[i] != 255))
780 rec.
input_embedding = detail::json_to_embedding(col_input_embedding_[i]);
783 rec.
output_score =
static_cast<float>(col_output_score_[i]);
791 records.push_back(std::move(rec));
801 const auto& kvs = reader_->key_value_metadata();
804 for (
const auto& kv : kvs) {
805 if (!kv.value.has_value())
continue;
806 const auto& val = *kv.value;
808 if (kv.key ==
"signetstack.audit.chain_id") meta.
chain_id = val;
809 else if (kv.key ==
"signetstack.audit.first_seq") {
try { meta.
start_sequence = std::stoll(val); }
catch (...) {} }
810 else if (kv.key ==
"signetstack.audit.last_seq") {
try { meta.
end_sequence = std::stoll(val); }
catch (...) {} }
811 else if (kv.key ==
"signetstack.audit.first_hash") meta.
first_hash = val;
812 else if (kv.key ==
"signetstack.audit.last_hash") meta.
last_hash = val;
813 else if (kv.key ==
"signetstack.audit.prev_file_hash") meta.
prev_file_hash = val;
814 else if (kv.key ==
"signetstack.audit.record_count") {
try { meta.
record_count = std::stoll(val); }
catch (...) {} }
815 else if (kv.key ==
"signetstack.audit.record_type") meta.
record_type = val;
830 auto validation = validate_loaded_columns();
841 std::vector<HashChainEntry> entries;
842 size_t n = col_chain_seq_.size();
845 for (
size_t i = 0; i < n; ++i) {
857 bad.
error_message = !eh ?
"entry_hash deserialization failed at record "
859 :
"prev_hash deserialization failed at record "
872 if (col_inference_type_[i] < 0 || (col_inference_type_[i] > 6 && col_inference_type_[i] != 255))
874 rec.
input_embedding = detail::json_to_embedding(col_input_embedding_[i]);
877 rec.
output_score =
static_cast<float>(col_output_score_[i]);
889 entries.push_back(std::move(entry));
894 return verifier.
verify(entries);
899 return reader_->schema();
904 return reader_->num_rows();
908 std::unique_ptr<ParquetReader> reader_;
912 std::vector<int64_t> col_timestamp_ns_;
913 std::vector<std::string> col_model_id_;
914 std::vector<std::string> col_model_version_;
915 std::vector<int32_t> col_inference_type_;
916 std::vector<std::string> col_input_embedding_;
917 std::vector<std::string> col_input_hash_;
918 std::vector<std::string> col_output_hash_;
919 std::vector<double> col_output_score_;
920 std::vector<int64_t> col_latency_ns_;
921 std::vector<int32_t> col_batch_size_;
922 std::vector<int32_t> col_input_tokens_;
923 std::vector<int32_t> col_output_tokens_;
924 std::vector<std::string> col_user_id_hash_;
925 std::vector<std::string> col_session_id_;
926 std::vector<std::string> col_metadata_json_;
927 std::vector<int64_t> col_chain_seq_;
928 std::vector<std::string> col_chain_hash_;
929 std::vector<std::string> col_prev_hash_;
931 [[nodiscard]]
inline expected<void> validate_loaded_columns()
const {
932 const size_t expected_rows = col_timestamp_ns_.size();
933 const auto mismatch = [&](
const char* column_name,
size_t actual_rows)
937 "InferenceLogReader: column '" + std::string(column_name) +
938 "' row count mismatch in '" + path_ +
"' (expected " +
939 std::to_string(expected_rows) +
", got " +
940 std::to_string(actual_rows) +
")"};
943 if (col_model_id_.size() != expected_rows)
944 return mismatch(
"model_id", col_model_id_.size());
945 if (col_model_version_.size() != expected_rows)
946 return mismatch(
"model_version", col_model_version_.size());
947 if (col_inference_type_.size() != expected_rows)
948 return mismatch(
"inference_type", col_inference_type_.size());
949 if (col_input_embedding_.size() != expected_rows)
950 return mismatch(
"input_embedding", col_input_embedding_.size());
951 if (col_input_hash_.size() != expected_rows)
952 return mismatch(
"input_hash", col_input_hash_.size());
953 if (col_output_hash_.size() != expected_rows)
954 return mismatch(
"output_hash", col_output_hash_.size());
955 if (col_output_score_.size() != expected_rows)
956 return mismatch(
"output_score", col_output_score_.size());
957 if (col_latency_ns_.size() != expected_rows)
958 return mismatch(
"latency_ns", col_latency_ns_.size());
959 if (col_batch_size_.size() != expected_rows)
960 return mismatch(
"batch_size", col_batch_size_.size());
961 if (col_input_tokens_.size() != expected_rows)
962 return mismatch(
"input_tokens", col_input_tokens_.size());
963 if (col_output_tokens_.size() != expected_rows)
964 return mismatch(
"output_tokens", col_output_tokens_.size());
965 if (col_user_id_hash_.size() != expected_rows)
966 return mismatch(
"user_id_hash", col_user_id_hash_.size());
967 if (col_session_id_.size() != expected_rows)
968 return mismatch(
"session_id", col_session_id_.size());
969 if (col_metadata_json_.size() != expected_rows)
970 return mismatch(
"metadata_json", col_metadata_json_.size());
971 if (col_chain_seq_.size() != expected_rows)
972 return mismatch(
"chain_seq", col_chain_seq_.size());
973 if (col_chain_hash_.size() != expected_rows)
974 return mismatch(
"chain_hash", col_chain_hash_.size());
975 if (col_prev_hash_.size() != expected_rows)
976 return mismatch(
"prev_hash", col_prev_hash_.size());
978 return expected<void>{};
982 [[nodiscard]]
inline expected<void> load_columns() {
983 int64_t num_rgs = reader_->num_row_groups();
985 for (int64_t rg = 0; rg < num_rgs; ++rg) {
986 size_t rg_idx =
static_cast<size_t>(rg);
989 auto r0 = reader_->read_column<int64_t>(rg_idx, 0);
990 if (!r0)
return r0.error();
991 col_timestamp_ns_.insert(col_timestamp_ns_.end(), r0->begin(), r0->end());
994 auto r1 = reader_->read_column<std::string>(rg_idx, 1);
995 if (!r1)
return r1.error();
996 col_model_id_.insert(col_model_id_.end(),
997 std::make_move_iterator(r1->begin()), std::make_move_iterator(r1->end()));
1000 auto r2 = reader_->read_column<std::string>(rg_idx, 2);
1001 if (!r2)
return r2.error();
1002 col_model_version_.insert(col_model_version_.end(),
1003 std::make_move_iterator(r2->begin()), std::make_move_iterator(r2->end()));
1006 auto r3 = reader_->read_column<int32_t>(rg_idx, 3);
1007 if (!r3)
return r3.error();
1008 col_inference_type_.insert(col_inference_type_.end(), r3->begin(), r3->end());
1011 auto r4 = reader_->read_column<std::string>(rg_idx, 4);
1012 if (!r4)
return r4.error();
1013 col_input_embedding_.insert(col_input_embedding_.end(),
1014 std::make_move_iterator(r4->begin()), std::make_move_iterator(r4->end()));
1017 auto r5 = reader_->read_column<std::string>(rg_idx, 5);
1018 if (!r5)
return r5.error();
1019 col_input_hash_.insert(col_input_hash_.end(),
1020 std::make_move_iterator(r5->begin()), std::make_move_iterator(r5->end()));
1023 auto r6 = reader_->read_column<std::string>(rg_idx, 6);
1024 if (!r6)
return r6.error();
1025 col_output_hash_.insert(col_output_hash_.end(),
1026 std::make_move_iterator(r6->begin()), std::make_move_iterator(r6->end()));
1029 auto r7 = reader_->read_column<
double>(rg_idx, 7);
1030 if (!r7)
return r7.error();
1031 col_output_score_.insert(col_output_score_.end(), r7->begin(), r7->end());
1034 auto r8 = reader_->read_column<int64_t>(rg_idx, 8);
1035 if (!r8)
return r8.error();
1036 col_latency_ns_.insert(col_latency_ns_.end(), r8->begin(), r8->end());
1039 auto r9 = reader_->read_column<int32_t>(rg_idx, 9);
1040 if (!r9)
return r9.error();
1041 col_batch_size_.insert(col_batch_size_.end(), r9->begin(), r9->end());
1044 auto r10 = reader_->read_column<int32_t>(rg_idx, 10);
1045 if (!r10)
return r10.error();
1046 col_input_tokens_.insert(col_input_tokens_.end(), r10->begin(), r10->end());
1049 auto r11 = reader_->read_column<int32_t>(rg_idx, 11);
1050 if (!r11)
return r11.error();
1051 col_output_tokens_.insert(col_output_tokens_.end(), r11->begin(), r11->end());
1054 auto r12 = reader_->read_column<std::string>(rg_idx, 12);
1055 if (!r12)
return r12.error();
1056 col_user_id_hash_.insert(col_user_id_hash_.end(),
1057 std::make_move_iterator(r12->begin()), std::make_move_iterator(r12->end()));
1060 auto r13 = reader_->read_column<std::string>(rg_idx, 13);
1061 if (!r13)
return r13.error();
1062 col_session_id_.insert(col_session_id_.end(),
1063 std::make_move_iterator(r13->begin()), std::make_move_iterator(r13->end()));
1066 auto r14 = reader_->read_column<std::string>(rg_idx, 14);
1067 if (!r14)
return r14.error();
1068 col_metadata_json_.insert(col_metadata_json_.end(),
1069 std::make_move_iterator(r14->begin()), std::make_move_iterator(r14->end()));
1072 auto r15 = reader_->read_column<int64_t>(rg_idx, 15);
1073 if (!r15)
return r15.error();
1074 col_chain_seq_.insert(col_chain_seq_.end(), r15->begin(), r15->end());
1077 auto r16 = reader_->read_column<std::string>(rg_idx, 16);
1078 if (!r16)
return r16.error();
1079 col_chain_hash_.insert(col_chain_hash_.end(),
1080 std::make_move_iterator(r16->begin()), std::make_move_iterator(r16->end()));
1083 auto r17 = reader_->read_column<std::string>(rg_idx, 17);
1084 if (!r17)
return r17.error();
1085 col_prev_hash_.insert(col_prev_hash_.end(),
1086 std::make_move_iterator(r17->begin()), std::make_move_iterator(r17->end()));
1089 return validate_loaded_columns();
Verifies hash chain integrity.
static VerificationResult verify(const uint8_t *chain_data, size_t chain_size)
Verify a chain from serialized bytes.
Builds SHA-256 hash chains during Parquet writes.
const std::vector< HashChainEntry > & entries() const
Return a const reference to the internal entry list.
HashChainEntry append(const uint8_t *record_data, size_t record_size, int64_t timestamp_ns)
Append a record to the chain with an explicit timestamp.
Reads ML inference log Parquet files and verifies hash chain integrity.
InferenceLogReader(const InferenceLogReader &)=delete
InferenceLogReader(InferenceLogReader &&)=default
InferenceLogReader()=default
const Schema & schema() const
Get the schema of the inference log file.
int64_t num_records() const
Number of records in the file.
InferenceLogReader & operator=(const InferenceLogReader &)=delete
expected< std::vector< InferenceRecord > > read_all() const
Get all inference records from the file.
static expected< InferenceLogReader > open(const std::string &path)
Open an inference log Parquet file and pre-load all column data.
expected< AuditMetadata > audit_metadata() const
Get the audit chain metadata from the Parquet file's key-value metadata.
InferenceLogReader & operator=(InferenceLogReader &&)=default
AuditChainVerifier::VerificationResult verify_chain() const
Verify the hash chain integrity by re-hashing each record and checking chain continuity.
Writes ML inference records to Parquet files with cryptographic hash chaining for tamper-evident auditing.
expected< void > close()
Close the writer (flushes remaining records).
size_t pending_records() const
Get the number of records in the current (unflushed) batch.
InferenceLogWriter(const std::string &output_dir, const std::string &chain_id="", size_t max_records=100000)
Create an inference log writer.
expected< void > flush()
Flush current records to a Parquet file.
expected< HashChainEntry > log(const InferenceRecord &record)
Log an inference event.
AuditMetadata current_metadata() const
Get the chain metadata for the current batch.
std::string current_file_path() const
Get the file path of the current (or last written) output file.
int64_t total_records() const
Get the total number of records written across all files.
static expected< ParquetReader > open(const std::filesystem::path &path)
Open and parse a Parquet file, returning a ready-to-query reader.
static expected< ParquetWriter > open(const std::filesystem::path &path, const Schema &schema, const Options &options=Options{})
Open a new Parquet file for writing.
Per-row lineage tracking inspired by Iceberg V3-style data governance.
RowLineage next(const uint8_t *row_data, size_t row_size)
Generate lineage for the next row.
SchemaBuilder & column(std::string col_name, LogicalType logical_type=LogicalType::NONE)
Add a typed column, deducing PhysicalType from T.
Schema build()
Build the final Schema, consuming the builder.
Immutable schema description for a Parquet file.
static SchemaBuilder builder(std::string name)
Create a SchemaBuilder for fluent column construction.
const Error & error() const
Access the error payload (valid for both success and failure; check ok() on the returned Error).
A lightweight result type that holds either a success value of type T or an Error.
std::array< uint8_t, 32 > sha256(const uint8_t *data, size_t size)
Compute SHA-256 hash of arbitrary-length input.
int64_t now_ns()
Return the current time as nanoseconds since the Unix epoch (UTC).
expected< std::array< uint8_t, 32 > > hex_to_hash(const std::string &hex)
Convert a 64-character lowercase hex string back to a 32-byte hash.
InferenceType
Classification of the ML inference operation.
@ REGRESSION
Continuous value prediction.
@ CUSTOM
Application-specific inference type.
@ CLASSIFICATION
Binary or multi-class classification.
@ RECOMMENDATION
Recommendation system inference.
@ EMBEDDING
Vector embedding computation.
@ GENERATION
LLM text generation.
@ ANOMALY
Anomaly/outlier detection.
@ RANKING
Ranking/scoring of candidates.
@ TIMESTAMP_NS
Timestamp — INT64, nanoseconds since Unix epoch.
Schema inference_log_schema()
Build the Parquet schema for ML inference log files.
std::string hash_to_hex(const std::array< uint8_t, 32 > &hash)
Convert a 32-byte SHA-256 hash to a lowercase hexadecimal string (64 chars).
@ CORRUPT_PAGE
A data page failed integrity checks (bad CRC, truncated, or exceeds size limits).
@ CORRUPT_DATA
Decoded data is corrupt or inconsistent (e.g. out-of-range dictionary index).
std::string generate_chain_id()
Generate a simple chain identifier based on the current timestamp.
Per-row lineage tracking (Iceberg V3-style) with monotonic row IDs, mutation versioning, and per-row origin-file and previous-hash provenance.
Schema definition types: Column<T>, SchemaBuilder, and Schema.
Result of a full chain verification.
bool valid
True if the entire chain passed all integrity checks.
int64_t first_bad_index
Index of the first entry that failed verification, or -1 if all entries are valid.
int64_t entries_checked
Number of entries that were successfully verified before a failure was detected (or the total count if the entire chain is valid).
std::string error_message
Human-readable description of the verification outcome.
Lightweight error value carrying an ErrorCode and a human-readable message.
A single link in the cryptographic hash chain.
int64_t sequence_number
0-indexed position in the chain, monotonically increasing.
std::array< uint8_t, 32 > entry_hash
SHA-256 commitment over (sequence_number, timestamp_ns, prev_hash, data_hash).
int64_t timestamp_ns
Nanoseconds since Unix epoch when this entry was created.
std::array< uint8_t, 32 > prev_hash
SHA-256 hash of the previous entry (all zeros for the first entry, or a user-supplied continuation hash).
std::array< uint8_t, 32 > data_hash
SHA-256 hash of the record/row data that this entry covers.
A single ML inference event with full operational metadata.
std::string training_dataset_id
Training data identifier.
std::string model_retraining_schedule
Cron or description of retraining schedule (EU AI Act Art.13)
std::string model_id
Model identifier (e.g., "gpt-4", "bert-base")
std::string output_hash
SHA-256 hash of raw output.
int64_t timestamp_ns
Inference timestamp (nanoseconds since epoch)
int64_t model_training_end_ns
Timestamp when model training completed (EU AI Act Art.12)
std::vector< uint8_t > serialize() const
Serialize the record to a deterministic byte sequence.
std::vector< float > input_embedding
Input embedding (optional, may be empty)
int64_t training_dataset_size
Number of samples in training dataset.
int64_t model_training_data_cutoff_ns
Latest data timestamp used in training.
static expected< InferenceRecord > deserialize(const uint8_t *data, size_t size)
Reconstruct an InferenceRecord from its serialized byte representation.
int64_t latency_ns
Inference latency in nanoseconds.
int32_t output_tokens
Output token count (LLM, 0 if N/A)
std::string session_id
Session identifier.
float output_score
Primary output score/probability.
std::string user_id_hash
Hashed user ID (for privacy)
InferenceType inference_type
Type of inference.
std::string metadata_json
Additional JSON metadata.
std::string model_version
Model version hash or checkpoint ID.
std::string training_data_characteristics
Description of training data properties.
int32_t input_tokens
Input token count (LLM, 0 if N/A)
std::string input_hash
SHA-256 hash of raw input (for privacy)
int32_t batch_size
Batch size.
Configuration options for ParquetWriter.
std::vector< thrift::KeyValue > file_metadata
Custom key-value metadata pairs embedded in the Parquet footer.
std::string created_by
Value written into the Parquet footer's "created_by" field.
Parquet KeyValue metadata entry (parquet.thrift field IDs 1-2).
Parquet format enumerations, type traits, and statistics structs.