iceberg表文件结构
iceberg库表层次关系如下图所示:

表目录结构如下图所示:

表空间目录下,数据存放在data子目录,元数据存放在metadata子目录。
案例
一张iceberg表的文件组织结构如下所示:
/data/warehouse/iceberg_test.db/test001/data
/data/warehouse/iceberg_test.db/test001/metadata
数据目录(data)
数据目录结构如下所示:
/data/warehouse/iceberg_test.db/test001/data/00000-1-0e9ae40d-250d-4525-b140-3f50e1427941-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00000-3-c40c9d93-45ec-43b0-8046-cbd3d23ec3a1-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00000-5-1a9166f6-2b8a-4f9c-b2a6-a948ed0968f9-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00000-7-2609dde8-9f25-4593-870f-8c9f2ea6c282-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00000-9-610d6cea-c212-47ed-b015-be6704480ee7-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-10-610d6cea-c212-47ed-b015-be6704480ee7-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-2-0e9ae40d-250d-4525-b140-3f50e1427941-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-4-c40c9d93-45ec-43b0-8046-cbd3d23ec3a1-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-6-1a9166f6-2b8a-4f9c-b2a6-a948ed0968f9-0-00001.parquet
/data/warehouse/iceberg_test.db/test001/data/00001-8-2609dde8-9f25-4593-870f-8c9f2ea6c282-0-00001.parquet
如果设置了分区,则包含分区子目录。第一次隐式分区也有对应的分区子目录。
如果没设置分区,则所有的数据文件都在data目录下。
元数据目录(metadata)
元数据目录如下所示:
# metadata json file
/data/warehouse/iceberg_test.db/test001/metadata/00000-9b04bd23-c3c4-4e05-ab15-720d9a070375.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00001-bc385dea-3727-4744-8876-b689c79435d0.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00002-5065399d-65dc-4604-a952-7f8427d4cbcd.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00003-ed2629ef-5032-4996-9eb4-05836bf5a01e.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00004-b2cb2152-d94e-42dd-a7ec-f98f152d8643.metadata.json
/data/warehouse/iceberg_test.db/test001/metadata/00005-040c65a8-c8bc-430b-96b8-a6f14fe01104.metadata.json
# manifest file
/data/warehouse/iceberg_test.db/test001/metadata/63340959-0aeb-4652-b735-e3a75cac1ff6-m0.avro
/data/warehouse/iceberg_test.db/test001/metadata/68108928-2b8f-44c1-9942-9a1e57d24473-m0.avro
/data/warehouse/iceberg_test.db/test001/metadata/a0d388c7-83c4-49a6-9a32-b058e2e4f919-m0.avro
/data/warehouse/iceberg_test.db/test001/metadata/a6339012-ecd8-4bc3-98e4-3bf81a682e62-m0.avro
/data/warehouse/iceberg_test.db/test001/metadata/c7c8aa50-1ab3-4bfc-80fe-d9cf593a3711-m0.avro
# manifest list file
/data/warehouse/iceberg_test.db/test001/metadata/snap-2106022616408784606-1-63340959-0aeb-4652-b735-e3a75cac1ff6.avro
/data/warehouse/iceberg_test.db/test001/metadata/snap-3652109360786887717-1-c7c8aa50-1ab3-4bfc-80fe-d9cf593a3711.avro
/data/warehouse/iceberg_test.db/test001/metadata/snap-3979142577400722791-1-68108928-2b8f-44c1-9942-9a1e57d24473.avro
/data/warehouse/iceberg_test.db/test001/metadata/snap-713236168318586475-1-a6339012-ecd8-4bc3-98e4-3bf81a682e62.avro
/data/warehouse/iceberg_test.db/test001/metadata/snap-7334379460333501439-1-a0d388c7-83c4-49a6-9a32-b058e2e4f919.avro
metadata文件
文件名格式为“{五位数字}-{UUID}.metadata.json”
文件内容如下所示:
{
"format-version" : 2,
"table-uuid" : "ea5ee732-3cdf-4b86-9732-49ab4e23600a",
"location" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001",
"last-sequence-number" : 5,
"last-updated-ms" : 1758803845006,
"last-column-id" : 2,
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : true,
"type" : "long",
"doc" : "unique id"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "hadoop",
"write.parquet.compression-codec" : "zstd"
},
"current-snapshot-id" : 2106022616408784606,
"refs" : {
"main" : {
"snapshot-id" : 2106022616408784606,
"type" : "branch"
}
},
"snapshots" : [ {
"sequence-number" : 1,
"snapshot-id" : 7334379460333501439,
"timestamp-ms" : 1758803839578,
"summary" : {
"operation" : "append",
"spark.app.id" : "application_1758599307040_0004",
"added-data-files" : "2",
"added-records" : "2",
"added-files-size" : "1288",
"changed-partition-count" : "1",
"total-records" : "2",
"total-files-size" : "1288",
"total-data-files" : "2",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0",
"engine-version" : "3.5.1",
"app-id" : "application_1758599307040_0004",
"engine-name" : "spark",
"iceberg-version" : "Apache Iceberg 1.8.1"
},
"manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-7334379460333501439-1-a0d388c7-83c4-49a6-9a32-b058e2e4f919.avro",
"schema-id" : 0
}, {
"sequence-number" : 2,
"snapshot-id" : 3979142577400722791,
"parent-snapshot-id" : 7334379460333501439,
"timestamp-ms" : 1758803841054,
"summary" : {
"operation" : "append",
"spark.app.id" : "application_1758599307040_0004",
"added-data-files" : "2",
"added-records" : "2",
"added-files-size" : "1302",
"changed-partition-count" : "1",
"total-records" : "4",
"total-files-size" : "2590",
"total-data-files" : "4",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0",
"engine-version" : "3.5.1",
"app-id" : "application_1758599307040_0004",
"engine-name" : "spark",
"iceberg-version" : "Apache Iceberg 1.8.1"
},
"manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-3979142577400722791-1-68108928-2b8f-44c1-9942-9a1e57d24473.avro",
"schema-id" : 0
}, {
"sequence-number" : 3,
"snapshot-id" : 3652109360786887717,
"parent-snapshot-id" : 3979142577400722791,
"timestamp-ms" : 1758803842388,
"summary" : {
"operation" : "append",
"spark.app.id" : "application_1758599307040_0004",
"added-data-files" : "2",
"added-records" : "2",
"added-files-size" : "1316",
"changed-partition-count" : "1",
"total-records" : "6",
"total-files-size" : "3906",
"total-data-files" : "6",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0",
"engine-version" : "3.5.1",
"app-id" : "application_1758599307040_0004",
"engine-name" : "spark",
"iceberg-version" : "Apache Iceberg 1.8.1"
},
"manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-3652109360786887717-1-c7c8aa50-1ab3-4bfc-80fe-d9cf593a3711.avro",
"schema-id" : 0
}, {
"sequence-number" : 4,
"snapshot-id" : 713236168318586475,
"parent-snapshot-id" : 3652109360786887717,
"timestamp-ms" : 1758803843700,
"summary" : {
"operation" : "append",
"spark.app.id" : "application_1758599307040_0004",
"added-data-files" : "2",
"added-records" : "2",
"added-files-size" : "1330",
"changed-partition-count" : "1",
"total-records" : "8",
"total-files-size" : "5236",
"total-data-files" : "8",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0",
"engine-version" : "3.5.1",
"app-id" : "application_1758599307040_0004",
"engine-name" : "spark",
"iceberg-version" : "Apache Iceberg 1.8.1"
},
"manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-713236168318586475-1-a6339012-ecd8-4bc3-98e4-3bf81a682e62.avro",
"schema-id" : 0
}, {
"sequence-number" : 5,
"snapshot-id" : 2106022616408784606,
"parent-snapshot-id" : 713236168318586475,
"timestamp-ms" : 1758803845006,
"summary" : {
"operation" : "append",
"spark.app.id" : "application_1758599307040_0004",
"added-data-files" : "2",
"added-records" : "2",
"added-files-size" : "1344",
"changed-partition-count" : "1",
"total-records" : "10",
"total-files-size" : "6580",
"total-data-files" : "10",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0",
"engine-version" : "3.5.1",
"app-id" : "application_1758599307040_0004",
"engine-name" : "spark",
"iceberg-version" : "Apache Iceberg 1.8.1"
},
"manifest-list" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/snap-2106022616408784606-1-63340959-0aeb-4652-b735-e3a75cac1ff6.avro",
"schema-id" : 0
} ],
"statistics" : [ ],
"partition-statistics" : [ ],
"snapshot-log" : [ {
"timestamp-ms" : 1758803839578,
"snapshot-id" : 7334379460333501439
}, {
"timestamp-ms" : 1758803841054,
"snapshot-id" : 3979142577400722791
}, {
"timestamp-ms" : 1758803842388,
"snapshot-id" : 3652109360786887717
}, {
"timestamp-ms" : 1758803843700,
"snapshot-id" : 713236168318586475
}, {
"timestamp-ms" : 1758803845006,
"snapshot-id" : 2106022616408784606
} ],
"metadata-log" : [ {
"timestamp-ms" : 1758803828850,
"metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00000-9b04bd23-c3c4-4e05-ab15-720d9a070375.metadata.json"
}, {
"timestamp-ms" : 1758803839578,
"metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00001-bc385dea-3727-4744-8876-b689c79435d0.metadata.json"
}, {
"timestamp-ms" : 1758803841054,
"metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00002-5065399d-65dc-4604-a952-7f8427d4cbcd.metadata.json"
}, {
"timestamp-ms" : 1758803842388,
"metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00003-ed2629ef-5032-4996-9eb4-05836bf5a01e.metadata.json"
}, {
"timestamp-ms" : 1758803843700,
"metadata-file" : "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/00004-b2cb2152-d94e-42dd-a7ec-f98f152d8643.metadata.json"
} ]
}
manifest list文件
文件名格式为:snap-{snapshotID}-{attemptID}-{commitUUID}.avro
manifest list文件记录了manifest file和统计信息。
文件内容如下所示:
{
"manifest_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/a6339012-ecd8-4bc3-98e4-3bf81a682e62-m0.avro",
"manifest_length": 6727,
"partition_spec_id": 0,
"content": 0,
"sequence_number": 4,
"min_sequence_number": 4,
"added_snapshot_id": 713236168318586475,
"added_files_count": 2,
"existing_files_count": 0,
"deleted_files_count": 0,
"added_rows_count": 2,
"existing_rows_count": 0,
"deleted_rows_count": 0,
"partitions": {
"array": [
]
},
"key_metadata": null
}
{
"manifest_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/c7c8aa50-1ab3-4bfc-80fe-d9cf593a3711-m0.avro",
"manifest_length": 6728,
"partition_spec_id": 0,
"content": 0,
"sequence_number": 3,
"min_sequence_number": 3,
"added_snapshot_id": 3652109360786887717,
"added_files_count": 2,
"existing_files_count": 0,
"deleted_files_count": 0,
"added_rows_count": 2,
"existing_rows_count": 0,
"deleted_rows_count": 0,
"partitions": {
"array": [
]
},
"key_metadata": null
}
{
"manifest_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/68108928-2b8f-44c1-9942-9a1e57d24473-m0.avro",
"manifest_length": 6726,
"partition_spec_id": 0,
"content": 0,
"sequence_number": 2,
"min_sequence_number": 2,
"added_snapshot_id": 3979142577400722791,
"added_files_count": 2,
"existing_files_count": 0,
"deleted_files_count": 0,
"added_rows_count": 2,
"existing_rows_count": 0,
"deleted_rows_count": 0,
"partitions": {
"array": [
]
},
"key_metadata": null
}
{
"manifest_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/metadata/a0d388c7-83c4-49a6-9a32-b058e2e4f919-m0.avro",
"manifest_length": 6722,
"partition_spec_id": 0,
"content": 0,
"sequence_number": 1,
"min_sequence_number": 1,
"added_snapshot_id": 7334379460333501439,
"added_files_count": 2,
"existing_files_count": 0,
"deleted_files_count": 0,
"added_rows_count": 2,
"existing_rows_count": 0,
"deleted_rows_count": 0,
"partitions": {
"array": [
]
},
"key_metadata": null
}
通过以下命令可查看一个avro文件:
java -jar avro-tools-1.12.0.jar tojson snap-713236168318586475-1-a6339012-ecd8-4bc3-98e4-3bf81a682e62.avro
输出结果为一行一行的数据,每一行都是一个json字符串;为便于阅读,上述案例把json字符串展开了。
manifest file文件
文件名格式为:{commitUUID}-m{manifestCount}.avro
manifest file记录了data文件和统计信息。
文件内容如下所示:
{
"status": 1,
"snapshot_id": {
"long": 713236168318586475
},
"sequence_number": null,
"file_sequence_number": null,
"data_file": {
"content": 0,
"file_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/data/00000-7-2609dde8-9f25-4593-870f-8c9f2ea6c282-0-00001.parquet",
"file_format": "PARQUET",
"partition": {
},
"record_count": 1,
"file_size_in_bytes": 665,
"column_sizes": {
"array": [
{
"key": 1,
"value": 40
},
{
"key": 2,
"value": 40
}
]
},
"value_counts": {
"array": [
{
"key": 1,
"value": 1
},
{
"key": 2,
"value": 1
}
]
},
"null_value_counts": {
"array": [
{
"key": 1,
"value": 0
},
{
"key": 2,
"value": 0
}
]
},
"nan_value_counts": {
"array": [
]
},
"lower_bounds": {
"array": [
{
"key": 1,
"value": "W\u0004\u0000\u0000\u0000\u0000\u0000\u0000"
},
{
"key": 2,
"value": "gggg"
}
]
},
"upper_bounds": {
"array": [
{
"key": 1,
"value": "W\u0004\u0000\u0000\u0000\u0000\u0000\u0000"
},
{
"key": 2,
"value": "gggg"
}
]
},
"key_metadata": null,
"split_offsets": {
"array": [
4
]
},
"equality_ids": null,
"sort_order_id": {
"int": 0
}
}
}
{
"status": 1,
"snapshot_id": {
"long": 713236168318586475
},
"sequence_number": null,
"file_sequence_number": null,
"data_file": {
"content": 0,
"file_path": "hdfs://HDFS10000011/data/warehouse/iceberg_test.db/test001/data/00001-8-2609dde8-9f25-4593-870f-8c9f2ea6c282-0-00001.parquet",
"file_format": "PARQUET",
"partition": {
},
"record_count": 1,
"file_size_in_bytes": 665,
"column_sizes": {
"array": [
{
"key": 1,
"value": 40
},
{
"key": 2,
"value": 40
}
]
},
"value_counts": {
"array": [
{
"key": 1,
"value": 1
},
{
"key": 2,
"value": 1
}
]
},
"null_value_counts": {
"array": [
{
"key": 1,
"value": 0
},
{
"key": 2,
"value": 0
}
]
},
"nan_value_counts": {
"array": [
]
},
"lower_bounds": {
"array": [
{
"key": 1,
"value": "X\u0004\u0000\u0000\u0000\u0000\u0000\u0000"
},
{
"key": 2,
"value": "hhhh"
}
]
},
"upper_bounds": {
"array": [
{
"key": 1,
"value": "X\u0004\u0000\u0000\u0000\u0000\u0000\u0000"
},
{
"key": 2,
"value": "hhhh"
}
]
},
"key_metadata": null,
"split_offsets": {
"array": [
4
]
},
"equality_ids": null,
"sort_order_id": {
"int": 0
}
}
}
通过以下命令读取avro信息:
java -jar avro-tools-1.12.0.jar tojson a6339012-ecd8-4bc3-98e4-3bf81a682e62-m0.avro