From 8d00e4fff2d71771657412fbe3d201da4beee5ac Mon Sep 17 00:00:00 2001 From: Mikhail Melnik Date: Fri, 22 May 2026 12:52:41 +0200 Subject: [PATCH 1/3] Fix VariantUtil string decoding to use explicit UTF-8 charset --- .../org/apache/parquet/variant/VariantUtil.java | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index 7ad867e0fd..7b8d477cab 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -20,6 +20,7 @@ import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.HashMap; import org.apache.parquet.Preconditions; @@ -657,12 +658,12 @@ static String getString(ByteBuffer value) { checkIndex(start + length - 1, value.limit()); if (value.hasArray()) { // If the buffer is backed by an array, we can use the array directly. - return new String(value.array(), value.arrayOffset() + start, length); + return new String(value.array(), value.arrayOffset() + start, length, StandardCharsets.UTF_8); } else { // If the buffer is not backed by an array, we need to copy the bytes into a new array. byte[] valueArray = new byte[length]; slice(value, start).get(valueArray); - return new String(valueArray); + return new String(valueArray, StandardCharsets.UTF_8); } } throw unexpectedType(Variant.Type.STRING, value); @@ -825,12 +826,13 @@ static String getMetadataKey(ByteBuffer metadata, int id) { } checkIndex(dataPos + nextOffset - 1, metadata.limit()); if (metadata.hasArray() && !metadata.isReadOnly()) { - return new String(metadata.array(), metadata.arrayOffset() + dataPos + offset, nextOffset - offset); + return new String( + metadata.array(), metadata.arrayOffset() + dataPos + offset, nextOffset - offset, StandardCharsets.UTF_8); } else { // ByteBuffer does not have an array, so we need to use the `get` method to read the bytes. byte[] metadataArray = new byte[nextOffset - offset]; slice(metadata, dataPos + offset).get(metadataArray); - return new String(metadataArray); + return new String(metadataArray, StandardCharsets.UTF_8); } } @@ -861,13 +863,14 @@ static HashMap getMetadataMap(ByteBuffer metadata) { new String( metadata.array(), metadata.arrayOffset() + pos + stringStart + offset, - nextOffset - offset), + nextOffset - offset, + StandardCharsets.UTF_8), id); } else { // ByteBuffer does not have an array, so we need to use the `get` method to read the bytes. byte[] metadataArray = new byte[nextOffset - offset]; slice(metadata, pos + stringStart + offset).get(metadataArray); - result.put(new String(metadataArray), id); + result.put(new String(metadataArray, StandardCharsets.UTF_8), id); } offset = nextOffset; } From dd1b3da83c13678688d8cd30cabb641aa722cba7 Mon Sep 17 00:00:00 2001 From: Mikhail Melnik Date: Sat, 23 May 2026 20:57:27 +0200 Subject: [PATCH 2/3] Apply Spotless formatting --- .../main/java/org/apache/parquet/variant/VariantUtil.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index 7b8d477cab..f50a0f3162 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -827,7 +827,10 @@ static String getMetadataKey(ByteBuffer metadata, int id) { checkIndex(dataPos + nextOffset - 1, metadata.limit()); if (metadata.hasArray() && !metadata.isReadOnly()) { return new String( - metadata.array(), metadata.arrayOffset() + dataPos + offset, nextOffset - offset, StandardCharsets.UTF_8); + metadata.array(), + metadata.arrayOffset() + dataPos + offset, + nextOffset - offset, + StandardCharsets.UTF_8); } else { // ByteBuffer does not have an array, so we need to use the `get` method to read the bytes. byte[] metadataArray = new byte[nextOffset - offset]; From e528cf6f38d1283c6b01eabc1944860133a210ab Mon Sep 17 00:00:00 2001 From: Mikhail Melnik Date: Wed, 27 May 2026 01:17:53 +0200 Subject: [PATCH 3/3] Add tests for non-ASCII string values, object keys and metadata map --- .../parquet/variant/TestVariantObject.java | 25 +++++++++++++++++++ .../parquet/variant/TestVariantParseJson.java | 9 +++++++ 2 files changed, 34 insertions(+) diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java index 1c823bd761..1c5dc4c5e2 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java @@ -406,4 +406,29 @@ public void testMetadataWithNonZeroPositionReadOnly() { Assert.assertEquals(0, immutableMetadata.getOrInsert("name")); Assert.assertEquals(1, immutableMetadata.getOrInsert("age")); } + + @Test + public void testMetadataMapWithUnicodeKeys() { + // Build a variant whose metadata dictionary contains non-ASCII keys. + VariantBuilder vb = new VariantBuilder(); + VariantObjectBuilder obj = vb.startObject(); + obj.appendKey("élève"); + obj.appendInt(1); + obj.appendKey("中文"); + obj.appendInt(2); + vb.endObject(); + Variant variant = vb.build(); + + ByteBuffer metaBuf = variant.getMetadataBuffer(); + + // hasArray branch + ImmutableMetadata writable = new ImmutableMetadata(metaBuf); + Assert.assertEquals(0, writable.getOrInsert("élève")); + Assert.assertEquals(1, writable.getOrInsert("中文")); + + // read-only branch (else path in getMetadataMap): asReadOnlyBuffer() makes isReadOnly() true + ImmutableMetadata readOnly = new ImmutableMetadata(metaBuf.asReadOnlyBuffer()); + Assert.assertEquals(0, readOnly.getOrInsert("élève")); + Assert.assertEquals(1, readOnly.getOrInsert("中文")); + } } diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java index f2697a00ff..fc1a24ba2d 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java @@ -241,6 +241,15 @@ public void testParseUnicodeString() throws IOException { Assert.assertEquals("\u00e9l\u00e8ve", v.getString()); } + @Test + public void testParseUnicodeKey() throws IOException { + Variant v = VariantJsonParser.parseJson("{\"\\u00e9l\\u00e8ve\": 42}"); + Assert.assertEquals(Variant.Type.OBJECT, v.getType()); + Variant value = v.getFieldByKey("élève"); + Assert.assertNotNull(value); + Assert.assertEquals(42, value.getInt()); + } + @Test public void testParseEscapedString() throws IOException { Variant v = VariantJsonParser.parseJson("\"hello\\nworld\"");