Iceberg是數據湖熱門組件之一,本系列文章將深入探究一二。
首先將研究iceberg底層存儲。
1、啟動本地的Spark
./bin/spark-sql \
--packages org.apache.iceberg:iceberg-spark3-runtime:0.12.1 \
--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \
--conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \
--conf spark.sql.catalog.spark_catalog.type=hive \
--conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \
--conf spark.sql.catalog.local.type=hadoop \
--conf spark.sql.catalog.local.warehouse=$PWD/warehouse
分別使用v1 v2兩種格式創建表
使用format-version 1創建表table
CREATE TABLE local.db.table (id bigint, data string) USING iceberg;
打開目錄,其結構如下:
(base) ? table ll -R
total 0
drwxr-xr-x 6 liliwei staff 192B Jan 2 21:22 metadata
./metadata:
total 16
-rw-r--r--@ 1 liliwei staff 1.2K Jan 2 21:22 v1.metadata.json
-rw-r--r--@ 1 liliwei staff 1B Jan 2 21:22 version-hint.text
(base) ? table
查看v1.metadata.json,內容如下:
{
"format-version" : 1,
"table-uuid" : "0dc08d49-ed4d-49bb-8ddf-006e37c65372",
"location" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table",
"last-updated-ms" : 1641129739691,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : false,
"type" : "long"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : false,
"type" : "long"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "liliwei"
},
"current-snapshot-id" : -1,
"snapshots" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ ]
}
查看version-hint.text,內容如下:
1
使用format-version 2創建表tableV2
CREATE TABLE local.db.tableV2 (id bigint, data string)
USING iceberg
TBLPROPERTIES ('format-version'='2');
tableV2的目錄結構如下:
(base) ? tableV2 cd metadata
(base) ? metadata ll
total 16
-rw-r--r-- 1 liliwei staff 936B Jan 2 21:38 v1.metadata.json
-rw-r--r-- 1 liliwei staff 1B Jan 2 21:38 version-hint.text
(base) ? metadata
v1.metadata.json的內容如下:
{
"format-version" : 2,
"table-uuid" : "67b54789-070c-4600-b2ff-3b9a0a774e4a",
"location" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/tableV2",
"last-sequence-number" : 0,
"last-updated-ms" : 1641130714999,
"last-column-id" : 2,
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : false,
"type" : "long"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "liliwei"
},
"current-snapshot-id" : -1,
"snapshots" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ ]
}
version-hint.text的內容如下:
1
現在,我們插入數據到表中
INSERT INTO local.db.table VALUES (1, 'a');
再次查看目錄結構:
(base) ? table tree -A -C -D
.
├── [Jan 2 21:45] data
│ └── [Jan 2 21:45] 00000-0-ea35130e-b5ed-4443-889f-2ee5e62e6757-00001.parquet
└── [Jan 2 21:45] metadata
├── [Jan 2 21:45] 1bd1f809-55ea-4ba1-b425-ab4ecc212434-m0.avro
├── [Jan 2 21:45] snap-5028042644139258397-1-1bd1f809-55ea-4ba1-b425-ab4ecc212434.avro
├── [Jan 2 21:22] v1.metadata.json
├── [Jan 2 21:45] v2.metadata.json
└── [Jan 2 21:45] version-hint.text
2 directories, 6 files
查看v2.metadata.json文件內容:
{
"format-version" : 1,
"table-uuid" : "0dc08d49-ed4d-49bb-8ddf-006e37c65372",
"location" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table",
"last-updated-ms" : 1641131156558,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : false,
"type" : "long"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : false,
"type" : "long"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "liliwei"
},
"current-snapshot-id" : 5028042644139258397,
"snapshots" : [ {
"snapshot-id" : 5028042644139258397,
"timestamp-ms" : 1641131156558,
"summary" : {
"operation" : "append",
"spark.app.id" : "local-1641129606166",
"added-data-files" : "1",
"added-records" : "1",
"added-files-size" : "643",
"changed-partition-count" : "1",
"total-records" : "1",
"total-files-size" : "643",
"total-data-files" : "1",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/metadata/snap-5028042644139258397-1-1bd1f809-55ea-4ba1-b425-ab4ecc212434.avro",
"schema-id" : 0
} ],
"snapshot-log" : [ {
"timestamp-ms" : 1641131156558,
"snapshot-id" : 5028042644139258397
} ],
"metadata-log" : [ {
"timestamp-ms" : 1641129739691,
"metadata-file" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/metadata/v1.metadata.json"
} ]
}
version-hint.text內容:
2
snap-5028042644139258397-1-1bd1f809-55ea-4ba1-b425-ab4ecc212434.avro內容:
此處借助于avro轉json的工具:avro-tools-1.10.2.jar (https://repo1.maven.org/maven2/org/apache/avro/avro-tools/1.10.2/)
java -jar ~/plat/tools/avro-tools-1.10.2.jar tojson metadata/snap-5028042644139258397-1-1bd1f809-55ea-4ba1-b425-ab4ecc212434.avro
{
"manifest_path": "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/metadata/1bd1f809-55ea-4ba1-b425-ab4ecc212434-m0.avro",
"manifest_length": 5803,
"partition_spec_id": 0,
"added_snapshot_id": {
"long": 5028042644139258397
},
"added_data_files_count": {
"int": 1
},
"existing_data_files_count": {
"int": 0
},
"deleted_data_files_count": {
"int": 0
},
"partitions": {
"array": []
},
"added_rows_count": {
"long": 1
},
"existing_rows_count": {
"long": 0
},
"deleted_rows_count": {
"long": 0
}
}
查看對應的 manifest 文件
(base) ? table java -jar ~/plat/tools/avro-tools-1.10.2.jar tojson metadata/1bd1f809-55ea-4ba1-b425-ab4ecc212434-m0.avro
{
"status": 1,
"snapshot_id": {
"long": 5028042644139258397
},
"data_file": {
"file_path": "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/data/00000-0-ea35130e-b5ed-4443-889f-2ee5e62e6757-00001.parquet",
"file_format": "PARQUET",
"partition": {},
"record_count": 1,
"file_size_in_bytes": 643,
"block_size_in_bytes": 67108864,
"column_sizes": {
"array": [{
"key": 1,
"value": 46
}, {
"key": 2,
"value": 48
}]
},
"value_counts": {
"array": [{
"key": 1,
"value": 1
}, {
"key": 2,
"value": 1
}]
},
"null_value_counts": {
"array": [{
"key": 1,
"value": 0
}, {
"key": 2,
"value": 0
}]
},
"nan_value_counts": {
"array": []
},
"lower_bounds": {
"array": [{
"key": 1,
"value": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
}, {
"key": 2,
"value": "a"
}]
},
"upper_bounds": {
"array": [{
"key": 1,
"value": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
}, {
"key": 2,
"value": "a"
}]
},
"key_metadata": null,
"split_offsets": {
"array": [4]
},
"sort_order_id": {
"int": 0
}
}
}
很明顯,其中包含了數據文件的具體位置,以及一些統計信息。我們再插入一條數據,看看會有什麼變化:
INSERT INTO local.db.table VALUES (2, 'b');
查看目錄結構:
(base) ? table tree -C
(base) ? table tree -C -D
.
├── [Jan 2 22:07] data
│ ├── [Jan 2 21:45] 00000-0-ea35130e-b5ed-4443-889f-2ee5e62e6757-00001.parquet
│ └── [Jan 2 22:07] 00000-1-631cd5bc-2ad0-4ddd-9530-f055b2888d56-00001.parquet
└── [Jan 2 22:07] metadata
├── [Jan 2 21:45] 1bd1f809-55ea-4ba1-b425-ab4ecc212434-m0.avro
├── [Jan 2 22:07] 6881af48-5efa-4660-99ed-be5b9f640e52-m0.avro
├── [Jan 2 22:07] snap-1270004071302473053-1-6881af48-5efa-4660-99ed-be5b9f640e52.avro
├── [Jan 2 21:45] snap-5028042644139258397-1-1bd1f809-55ea-4ba1-b425-ab4ecc212434.avro
├── [Jan 2 21:22] v1.metadata.json
├── [Jan 2 21:45] v2.metadata.json
├── [Jan 2 22:07] v3.metadata.json
└── [Jan 2 22:07] version-hint.text
2 directories, 10 files
v3.metadata.json文件內容如下:
{
"format-version" : 1,
"table-uuid" : "0dc08d49-ed4d-49bb-8ddf-006e37c65372",
"location" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table",
"last-updated-ms" : 1641132476394,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : false,
"type" : "long"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : false,
"type" : "long"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "liliwei"
},
"current-snapshot-id" : 1270004071302473053,
"snapshots" : [ {
"snapshot-id" : 5028042644139258397,
"timestamp-ms" : 1641131156558,
"summary" : {
"operation" : "append",
"spark.app.id" : "local-1641129606166",
"added-data-files" : "1",
"added-records" : "1",
"added-files-size" : "643",
"changed-partition-count" : "1",
"total-records" : "1",
"total-files-size" : "643",
"total-data-files" : "1",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/metadata/snap-5028042644139258397-1-1bd1f809-55ea-4ba1-b425-ab4ecc212434.avro",
"schema-id" : 0
}, {
"snapshot-id" : 1270004071302473053,
"parent-snapshot-id" : 5028042644139258397,
"timestamp-ms" : 1641132476394,
"summary" : {
"operation" : "append",
"spark.app.id" : "local-1641129606166",
"added-data-files" : "1",
"added-records" : "1",
"added-files-size" : "643",
"changed-partition-count" : "1",
"total-records" : "2",
"total-files-size" : "1286",
"total-data-files" : "2",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/metadata/snap-1270004071302473053-1-6881af48-5efa-4660-99ed-be5b9f640e52.avro",
"schema-id" : 0
} ],
"snapshot-log" : [ {
"timestamp-ms" : 1641131156558,
"snapshot-id" : 5028042644139258397
}, {
"timestamp-ms" : 1641132476394,
"snapshot-id" : 1270004071302473053
} ],
"metadata-log" : [ {
"timestamp-ms" : 1641129739691,
"metadata-file" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/metadata/v1.metadata.json"
}, {
"timestamp-ms" : 1641131156558,
"metadata-file" : "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/metadata/v2.metadata.json"
} ]
}
snap-1270004071302473053-1-6881af48-5efa-4660-99ed-be5b9f640e52.avro文件內容如下:
{
"manifest_path": "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/metadata/6881af48-5efa-4660-99ed-be5b9f640e52-m0.avro",
"manifest_length": 5802,
"partition_spec_id": 0,
"added_snapshot_id": {
"long": 1270004071302473053
},
"added_data_files_count": {
"int": 1
},
"existing_data_files_count": {
"int": 0
},
"deleted_data_files_count": {
"int": 0
},
"partitions": {
"array": []
},
"added_rows_count": {
"long": 1
},
"existing_rows_count": {
"long": 0
},
"deleted_rows_count": {
"long": 0
}
}
{
"manifest_path": "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/metadata/1bd1f809-55ea-4ba1-b425-ab4ecc212434-m0.avro",
"manifest_length": 5803,
"partition_spec_id": 0,
"added_snapshot_id": {
"long": 5028042644139258397
},
"added_data_files_count": {
"int": 1
},
"existing_data_files_count": {
"int": 0
},
"deleted_data_files_count": {
"int": 0
},
"partitions": {
"array": []
},
"added_rows_count": {
"long": 1
},
"existing_rows_count": {
"long": 0
},
"deleted_rows_count": {
"long": 0
}
}
注意,以上是兩條數據,分別是兩個JSON格式。
6881af48-5efa-4660-99ed-be5b9f640e52-m0.avro內容如下:
{
"status": 1,
"snapshot_id": {
"long": 1270004071302473053
},
"data_file": {
"file_path": "/Users/liliwei/plat/spark-3.1.2-bin-hadoop3.2/warehouse/db/table/data/00000-1-631cd5bc-2ad0-4ddd-9530-f055b2888d56-00001.parquet",
"file_format": "PARQUET",
"partition": {},
"record_count": 1,
"file_size_in_bytes": 643,
"block_size_in_bytes": 67108864,
"column_sizes": {
"array": [{
"key": 1,
"value": 46
}, {
"key": 2,
"value": 48
}]
},
"value_counts": {
"array": [{
"key": 1,
"value": 1
}, {
"key": 2,
"value": 1
}]
},
"null_value_counts": {
"array": [{
"key": 1,
"value": 0
}, {
"key": 2,
"value": 0
}]
},
"nan_value_counts": {
"array": []
},
"lower_bounds": {
"array": [{
"key": 1,
"value": "\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
}, {
"key": 2,
"value": "b"
}]
},
"upper_bounds": {
"array": [{
"key": 1,
"value": "\u0002\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
}, {
"key": 2,
"value": "b"
}]
},
"key_metadata": null,
"split_offsets": {
"array": [4]
},
"sort_order_id": {
"int": 0
}
}
}