15#include <emscripten.h>
16#include <emscripten/bind.h>
17#include <emscripten/val.h>
25namespace em = emscripten;
34static constexpr unsigned MEMFS_MAX_FILE_SIZE = 256u * 1024u * 1024u;
// Copy the contents of a JS ArrayBuffer into a file at `path` via the JS-side
// FS.writeFile call below.
//
// path        - destination path handed to FS.writeFile.
// arrayBuffer - JS value wrapped in a Uint8Array to obtain the bytes.
//
// Inputs longer than MEMFS_MAX_FILE_SIZE are reported through emscripten_log.
// NOTE(review): the statements following the log call (presumably an early
// `return false`) and the final return are not visible in this listing — confirm.
44static bool writeFileToMemfs(
const std::string& path,
const em::val& arrayBuffer) {
// View the incoming buffer as bytes on the JS side.
45 auto view = em::val::global(
"Uint8Array").new_(arrayBuffer);
46 auto len = view[
"length"].as<
unsigned>();
// Size guard against the MEMFS_MAX_FILE_SIZE cap.
47 if (len > MEMFS_MAX_FILE_SIZE) {
48 emscripten_log(EM_LOG_ERROR,
"writeFileToMemfs: file too large (%u bytes, max %u)", len, MEMFS_MAX_FILE_SIZE);
// Stage the bytes in a C++ buffer: expose `buf` to JS as a typed memory view
// and let TypedArray.set() copy `view` into it.
51 std::vector<uint8_t> buf(len);
52 auto memView = em::val(em::typed_memory_view(len, buf.data()));
53 memView.call<
void>(
"set", view);
55 FS.writeFile(UTF8ToString($0), HEAPU8.subarray($1, $1 + $2));
56 }, path.c_str(), buf.data(),
static_cast<int>(len));
// Read the file at `path` through the module's "FS" object.
//
// Looks up the "FS" module property and returns whatever its `readFile(path)`
// call yields, as a JS value.
// NOTE(review): behaviour for a missing file depends on FS.readFile and is not
// handled in the visible code — confirm against callers.
66static em::val readFileFromMemfs(
const std::string& path) {
67 em::val fs = em::val::module_property(
"FS");
68 return fs.call<em::val>(
"readFile", path);
/// Decode a hexadecimal string into raw bytes.
///
/// Both upper- and lower-case digits are accepted. No prefix ("0x") or
/// separators are recognised.
///
/// @param hex  Hex text; must contain an even number of characters.
/// @return The decoded bytes, or an empty vector when `hex` has odd length
///         or contains any non-hex character (an empty input also yields an
///         empty vector).
static std::vector<uint8_t> hexToBytes(const std::string& hex) {
    // Maps one hex digit to its value; -1 marks an invalid character.
    // Built once (not per iteration) and every path returns a value.
    const auto nibble = [](char c) -> int {
        if (c >= '0' && c <= '9') return c - '0';
        if (c >= 'a' && c <= 'f') return 10 + c - 'a';
        if (c >= 'A' && c <= 'F') return 10 + c - 'A';
        return -1;
    };

    std::vector<uint8_t> bytes;
    if (hex.size() % 2 != 0) return bytes;

    bytes.reserve(hex.size() / 2);
    for (size_t i = 0; i < hex.size(); i += 2) {
        const int h = nibble(hex[i]);
        const int l = nibble(hex[i + 1]);
        // Any bad digit invalidates the whole input, not just this byte.
        if (h < 0 || l < 0) return {};
        bytes.push_back(static_cast<uint8_t>((h << 4) | l));
    }
    return bytes;
}
// Minimal scanner for a flat JSON object of string keys and string values.
//
// Returns the (key, value) pairs in the order they appear. Scanning starts at
// the first '{' in `json`; if there is none the result is empty. Pairs whose
// key is empty are dropped.
// NOTE(review): this is not a general JSON parser — only quoted strings, ':'
// and ',' are recognised in the visible code. How escapes are decoded inside
// readStr is not fully visible in this listing; confirm before relying on it.
107static std::vector<std::pair<std::string, std::string>>
108parseColumnKeys(
const std::string& json) {
109 std::vector<std::pair<std::string, std::string>> result;
110 size_t i = json.find(
'{');
111 if (i == std::string::npos)
return result;
// skipWs: advance the cursor past every character <= ' ' (space and control characters).
113 auto skipWs = [&]() {
while (i < json.size() && json[i] <=
' ') ++i; };
// readStr: read one double-quoted string at the cursor; yields an empty string
// when the cursor is not on an opening quote.
114 auto readStr = [&]() -> std::string {
116 if (i >= json.size() || json[i] !=
'"')
return {};
// Consume characters up to the closing quote; a backslash with a following
// character is treated specially (escape handling).
119 while (i < json.size() && json[i] !=
'"') {
120 if (json[i] ==
'\\' && i + 1 < json.size()) {
// Step over the closing quote if one was found.
125 if (i < json.size()) ++i;
// Main loop: key, optional ':', value, optional ',' — until '}' or end of input.
128 while (i < json.size()) {
130 if (json[i] ==
'}')
break;
131 auto key = readStr();
133 if (i < json.size() && json[i] ==
':') ++i;
134 auto val = readStr();
// Only keep pairs with a non-empty key.
135 if (!key.empty()) result.emplace_back(std::move(key), std::move(val));
137 if (i < json.size() && json[i] ==
',') ++i;
147static std::string version() {
197static std::string schemaColumnName(
const Schema& s,
size_t i) {
203static int schemaColumnPhysicalType(
const Schema& s,
size_t i) {
209static std::string schemaName(
const Schema& s) {
return s.
name(); }
// Map an integer physical-type code to its display name.
//
// pt - integer code compared against the PhysicalType enumerators below.
// Returns the enumerator's name as a string, or "UNKNOWN" when `pt` matches
// none of the listed cases.
215static std::string physicalTypeName(
int pt) {
217 case PhysicalType::BOOLEAN:
return "BOOLEAN";
218 case PhysicalType::INT32:
return "INT32";
219 case PhysicalType::INT64:
return "INT64";
220 case PhysicalType::INT96:
return "INT96";
221 case PhysicalType::FLOAT:
return "FLOAT";
222 case PhysicalType::DOUBLE:
return "DOUBLE";
223 case PhysicalType::BYTE_ARRAY:
return "BYTE_ARRAY";
224 case PhysicalType::FIXED_LEN_BYTE_ARRAY:
return "FIXED_LEN_BYTE_ARRAY";
// Any code not listed above.
225 default:
return "UNKNOWN";
// Map an integer logical-type code to its display name.
//
// lt - integer code compared against the LogicalType enumerators below.
// Returns the enumerator's name as a string, or "UNKNOWN" when `lt` matches
// none of the listed cases.
233static std::string logicalTypeName(
int lt) {
235 case LogicalType::NONE:
return "NONE";
236 case LogicalType::STRING:
return "STRING";
237 case LogicalType::ENUM:
return "ENUM";
238 case LogicalType::UUID:
return "UUID";
239 case LogicalType::DATE:
return "DATE";
240 case LogicalType::TIME_MS:
return "TIME_MS";
241 case LogicalType::TIME_US:
return "TIME_US";
242 case LogicalType::TIME_NS:
return "TIME_NS";
243 case LogicalType::TIMESTAMP_MS:
return "TIMESTAMP_MS";
244 case LogicalType::TIMESTAMP_US:
return "TIMESTAMP_US";
245 case LogicalType::TIMESTAMP_NS:
return "TIMESTAMP_NS";
246 case LogicalType::DECIMAL:
return "DECIMAL";
247 case LogicalType::JSON:
return "JSON";
248 case LogicalType::BSON:
return "BSON";
249 case LogicalType::FLOAT16:
return "FLOAT16";
250 case LogicalType::FLOAT32_VECTOR:
return "FLOAT32_VECTOR";
// Any code not listed above.
251 default:
return "UNKNOWN";
256static int schemaColumnLogicalType(
const Schema& s,
size_t i) {
291 std::unique_ptr<ParquetWriter> writer_;
303 if (!result.has_value())
return false;
304 writer_ = std::make_unique<ParquetWriter>(std::move(*result));
318 if (!writer_ || col >= writer_->num_columns())
return false;
319 auto len = arr[
"length"].as<
unsigned>();
320 std::vector<bool> buf(len);
321 for (
unsigned i = 0; i < len; ++i) buf[i] = arr[i].as<bool>();
323 std::vector<uint8_t> raw(len);
324 for (
unsigned i = 0; i < len; ++i) raw[i] = buf[i] ? 1 : 0;
325 return writer_->write_column<
bool>(col,
reinterpret_cast<const bool*
>(raw.data()), len).has_value();
330 if (!writer_ || col >= writer_->num_columns())
return false;
331 auto len = arr[
"length"].as<
unsigned>();
332 std::vector<int32_t> buf(len);
333 for (
unsigned i = 0; i < len; ++i) buf[i] = arr[i].as<int32_t>();
334 return writer_->write_column<int32_t>(col, buf.data(), len).has_value();
339 if (!writer_ || col >= writer_->num_columns())
return false;
340 auto len = arr[
"length"].as<
unsigned>();
341 std::vector<int64_t> buf(len);
342 for (
unsigned i = 0; i < len; ++i) buf[i] = arr[i].as<int64_t>();
343 return writer_->write_column<int64_t>(col, buf.data(), len).has_value();
348 if (!writer_ || col >= writer_->num_columns())
return false;
349 auto len = arr[
"length"].as<
unsigned>();
350 std::vector<float> buf(len);
351 for (
unsigned i = 0; i < len; ++i) buf[i] = arr[i].as<float>();
352 return writer_->write_column<
float>(col, buf.data(), len).has_value();
357 if (!writer_ || col >= writer_->num_columns())
return false;
358 auto len = arr[
"length"].as<
unsigned>();
359 std::vector<double> buf(len);
360 for (
unsigned i = 0; i < len; ++i) buf[i] = arr[i].as<double>();
361 return writer_->write_column<
double>(col, buf.data(), len).has_value();
366 if (!writer_ || col >= writer_->num_columns())
return false;
367 auto len = arr[
"length"].as<
unsigned>();
368 std::vector<std::string> buf(len);
369 for (
unsigned i = 0; i < len; ++i) buf[i] = arr[i].as<std::string>();
370 return writer_->write_column<std::string>(col, buf.data(), len).has_value();
378 if (!writer_)
return false;
379 return writer_->flush_row_group().has_value();
385 if (!writer_)
return false;
386 return writer_->close().has_value();
391 return writer_ ? writer_->rows_written() : 0;
396 return writer_ && writer_->is_open();
410 std::unique_ptr<ParquetReader> reader_;
418 bool open(
const std::string& path) {
420 if (!result.has_value())
return false;
421 reader_ = std::make_unique<ParquetReader>(std::move(*result));
425#if SIGNET_ENABLE_COMMERCIAL
442 bool openEncrypted(
const std::string& path,
443 const std::string& footerKeyHex,
444 const std::string& columnKeyHex,
445 const std::string& aadPrefix,
446 const std::string& columnKeysJson) {
449 if (cfg.
footer_key.size() != 32)
return false;
451 if (!columnKeyHex.empty()) {
456 if (!aadPrefix.empty()) {
460 if (!columnKeysJson.empty()) {
461 auto keys = parseColumnKeys(columnKeysJson);
462 for (
auto& [name, hexKey] : keys) {
465 spec.
key = hexToBytes(hexKey);
466 if (spec.
key.size() != 32)
return false;
474 auto zero_vec = [](std::vector<uint8_t>& v) {
476 volatile uint8_t* p = v.data();
477 for (
size_t i = 0; i < v.size(); ++i) p[i] = 0;
483 for (
auto& ck : cfg.column_keys) zero_vec(ck.key);
485 if (!result.has_value())
return false;
486 reader_ = std::make_unique<ParquetReader>(std::move(*result));
493 return reader_ ? reader_->num_rows() : 0;
498 return reader_ ? reader_->num_row_groups() : 0;
503 if (!reader_)
return Schema{};
504 return reader_->schema();
509 return reader_ ? reader_->created_by() :
"";
522 if (!reader_)
return em::val::array();
523 auto result = reader_->read_column<
bool>(rg, col);
524 if (!result.has_value())
return em::val::array();
525 auto arr = em::val::array();
526 for (
size_t i = 0; i < result->size(); ++i)
527 arr.call<
void>(
"push", (*result)[i]);
533 if (!reader_)
return em::val::array();
534 auto result = reader_->read_column<int32_t>(rg, col);
535 if (!result.has_value())
return em::val::array();
536 auto arr = em::val::array();
537 for (
size_t i = 0; i < result->size(); ++i)
538 arr.call<
void>(
"push", (*result)[i]);
544 if (!reader_)
return em::val::array();
545 auto result = reader_->read_column<int64_t>(rg, col);
546 if (!result.has_value())
return em::val::array();
547 auto arr = em::val::array();
548 for (
size_t i = 0; i < result->size(); ++i)
549 arr.call<
void>(
"push",
static_cast<double>((*result)[i]));
555 if (!reader_)
return em::val::array();
556 auto result = reader_->read_column<
float>(rg, col);
557 if (!result.has_value())
return em::val::array();
558 auto arr = em::val::array();
559 for (
size_t i = 0; i < result->size(); ++i)
560 arr.call<
void>(
"push", (*result)[i]);
566 if (!reader_)
return em::val::array();
567 auto result = reader_->read_column<
double>(rg, col);
568 if (!result.has_value())
return em::val::array();
569 auto arr = em::val::array();
570 for (
size_t i = 0; i < result->size(); ++i)
571 arr.call<
void>(
"push", (*result)[i]);
577 if (!reader_)
return em::val::array();
578 auto result = reader_->read_column<std::string>(rg, col);
579 if (!result.has_value())
return em::val::array();
580 auto arr = em::val::array();
581 for (
size_t i = 0; i < result->size(); ++i)
582 arr.call<
void>(
"push", (*result)[i]);
596 if (!reader_)
return em::val::array();
597 auto result = reader_->read_column_as_strings(rg, col);
598 if (!result.has_value())
return em::val::array();
599 auto arr = em::val::array();
600 for (
size_t i = 0; i < result->size(); ++i)
601 arr.call<
void>(
"push", (*result)[i]);
619 em::function(
"version", &version);
620 em::function(
"physicalTypeName", &physicalTypeName);
621 em::function(
"logicalTypeName", &logicalTypeName);
622 em::function(
"writeFileToMemfs", &writeFileToMemfs);
623 em::function(
"readFileFromMemfs", &readFileFromMemfs);
626 em::class_<Schema>(
"Schema")
628 .function(
"numColumns", &schemaNumColumns)
629 .function(
"columnName", &schemaColumnName)
630 .function(
"columnPhysicalType", &schemaColumnPhysicalType)
631 .function(
"columnLogicalType", &schemaColumnLogicalType)
632 .function(
"name", &schemaName)
636 em::class_<WasmSchemaBuilder>(
"SchemaBuilder")
637 .constructor<std::string>()
648 em::class_<WasmWriterOptions>(
"WriterOptions")
655 em::class_<WasmParquetWriter>(
"ParquetWriter")
671 em::class_<WasmParquetReader>(
"ParquetReader")
674#if SIGNET_ENABLE_COMMERCIAL
675 .function(
"openEncrypted", &WasmParquetReader::openEncrypted)
JavaScript-facing Parquet reader.
em::val readColumnInt32(size_t rg, size_t col)
Read an int32 column as a JS Array of numbers.
em::val readColumnFloat(size_t rg, size_t col)
Read a float column as a JS Array of numbers.
em::val readColumnBool(size_t rg, size_t col)
Read a boolean column as a JS Array of booleans.
bool open(const std::string &path)
Open a plaintext Parquet file from MEMFS.
em::val readColumnInt64(size_t rg, size_t col)
Read an int64 column as a JS Array of doubles (JS has no native int64).
int64_t numRowGroups() const
Return the number of row groups in the file, or 0 if not open.
em::val readColumnString(size_t rg, size_t col)
Read a string (BYTE_ARRAY) column as a JS Array of strings.
WasmParquetReader()=default
Default-construct in an unopened state.
em::val readColumnAsStrings(size_t rg, size_t col)
Read any column as a JS Array of strings (type-erased).
em::val readColumnDouble(size_t rg, size_t col)
Read a double column as a JS Array of numbers.
Schema schema() const
Return the file's schema, or an empty Schema if not open.
int64_t numRows() const
Return total row count across all row groups, or 0 if not open.
std::string createdBy() const
Return the "created by" metadata string, or "" if not open.
JavaScript-facing Parquet writer.
bool flushRowGroup()
Flush the current row group to disk and begin a new one.
bool writeColumnInt32(size_t col, const em::val &arr)
Write an int32 column from a JS array.
bool writeColumnBool(size_t col, const em::val &arr)
Write a boolean column from a JS array.
bool writeColumnString(size_t col, const em::val &arr)
Write a string (BYTE_ARRAY) column from a JS array.
WasmParquetWriter()=default
Default-construct in an unopened state.
bool writeColumnInt64(size_t col, const em::val &arr)
Write an int64 column from a JS array.
bool writeColumnDouble(size_t col, const em::val &arr)
Write a double column from a JS array.
int64_t rowsWritten() const
Return the total number of rows written so far (across all row groups).
bool close()
Finalize the Parquet file (writes footer metadata and closes the file).
bool isOpen() const
Check whether the writer is currently open and accepting data.
bool writeColumnFloat(size_t col, const em::val &arr)
Write a float column from a JS array.
bool open(const std::string &path, const Schema &schema, const WasmWriterOptions &opts)
Open a new Parquet file for writing on MEMFS.
Fluent Parquet schema builder exposed to JavaScript.
WasmSchemaBuilder & addInt64(const std::string &col)
WasmSchemaBuilder & addBool(const std::string &col)
Schema build()
Finalize and return the immutable Schema object.
WasmSchemaBuilder & addInt32(const std::string &col)
WasmSchemaBuilder & addDouble(const std::string &col)
WasmSchemaBuilder(const std::string &name)
Construct a new schema builder.
WasmSchemaBuilder & addString(const std::string &col)
WasmSchemaBuilder & addFloat(const std::string &col)
Thin wrapper around core WriterOptions for JavaScript consumption.
WriterOptions opts
Underlying writer options struct.
int64_t getRowGroupSize() const
Get the current row group size setting.
WasmWriterOptions()=default
Construct with default options.
void setRowGroupSize(int64_t n)
Set the target row group size (number of rows per group).
static expected< ParquetReader > open(const std::filesystem::path &path)
Open and parse a Parquet file, returning a ready-to-query reader.
static expected< ParquetWriter > open(const std::filesystem::path &path, const Schema &schema, const Options &options=Options{})
Open a new Parquet file for writing.
Fluent builder for constructing a Schema one column at a time.
SchemaBuilder & column(std::string col_name, LogicalType logical_type=LogicalType::NONE)
Add a typed column, deducing PhysicalType from T.
Schema build()
Build the final Schema, consuming the builder.
Immutable schema description for a Parquet file.
size_t num_columns() const
Number of columns in this schema.
const std::string & name() const
Root schema name (e.g. "tick_data").
const ColumnDescriptor & column(size_t index) const
Access a column descriptor by index.
Single-include umbrella header for the Signet Forge library.
constexpr const char * SIGNET_CREATED_BY
Default "created_by" string embedded in every Parquet footer.
PhysicalType
Parquet physical (storage) types as defined in parquet.thrift.
LogicalType
Parquet logical types (from parquet.thrift LogicalType union).
EMSCRIPTEN_BINDINGS(signet_forge)
Emscripten embind registration block.
LogicalType logical_type
Semantic annotation (STRING, TIMESTAMP_NS, etc.).
std::string name
Column name (unique within a schema).
PhysicalType physical_type
On-disk storage type.
Configuration options for ParquetWriter.
int64_t row_group_size
Target number of rows per row group.
Specifies the encryption key for a single Parquet column.
std::string column_name
Parquet column path (e.g. "a.b.c").
std::vector< uint8_t > key
32-byte AES-256 key (INTERNAL mode).
Top-level configuration structure that drives FileEncryptor / FileDecryptor.
std::vector< uint8_t > default_column_key
Default column key (32 bytes).
std::vector< uint8_t > footer_key
32-byte AES-256 key for encrypting the Parquet footer (FileMetaData).
std::string aad_prefix
AAD prefix – typically a file identifier or URI.
std::vector< ColumnKeySpec > column_keys
Per-column key specifications. Columns listed here get their own key.