diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java index 7ad867e0fd..f50a0f3162 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java @@ -20,6 +20,7 @@ import java.math.BigInteger; import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.HashMap; import org.apache.parquet.Preconditions; @@ -657,12 +658,12 @@ static String getString(ByteBuffer value) { checkIndex(start + length - 1, value.limit()); if (value.hasArray()) { // If the buffer is backed by an array, we can use the array directly. - return new String(value.array(), value.arrayOffset() + start, length); + return new String(value.array(), value.arrayOffset() + start, length, StandardCharsets.UTF_8); } else { // If the buffer is not backed by an array, we need to copy the bytes into a new array. byte[] valueArray = new byte[length]; slice(value, start).get(valueArray); - return new String(valueArray); + return new String(valueArray, StandardCharsets.UTF_8); } } throw unexpectedType(Variant.Type.STRING, value); @@ -825,12 +826,16 @@ static String getMetadataKey(ByteBuffer metadata, int id) { } checkIndex(dataPos + nextOffset - 1, metadata.limit()); if (metadata.hasArray() && !metadata.isReadOnly()) { - return new String(metadata.array(), metadata.arrayOffset() + dataPos + offset, nextOffset - offset); + return new String( + metadata.array(), + metadata.arrayOffset() + dataPos + offset, + nextOffset - offset, + StandardCharsets.UTF_8); } else { // ByteBuffer does not have an array, so we need to use the `get` method to read the bytes. byte[] metadataArray = new byte[nextOffset - offset]; slice(metadata, dataPos + offset).get(metadataArray); - return new String(metadataArray); + return new String(metadataArray, StandardCharsets.UTF_8); } } @@ -861,13 +866,14 @@ static HashMap getMetadataMap(ByteBuffer metadata) { new String( metadata.array(), metadata.arrayOffset() + pos + stringStart + offset, - nextOffset - offset), + nextOffset - offset, + StandardCharsets.UTF_8), id); } else { // ByteBuffer does not have an array, so we need to use the `get` method to read the bytes. byte[] metadataArray = new byte[nextOffset - offset]; slice(metadata, pos + stringStart + offset).get(metadataArray); - result.put(new String(metadataArray), id); + result.put(new String(metadataArray, StandardCharsets.UTF_8), id); } offset = nextOffset; } diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java index 1c823bd761..1c5dc4c5e2 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java @@ -406,4 +406,29 @@ public void testMetadataWithNonZeroPositionReadOnly() { Assert.assertEquals(0, immutableMetadata.getOrInsert("name")); Assert.assertEquals(1, immutableMetadata.getOrInsert("age")); } + + @Test + public void testMetadataMapWithUnicodeKeys() { + // Build a variant whose metadata dictionary contains non-ASCII keys. + VariantBuilder vb = new VariantBuilder(); + VariantObjectBuilder obj = vb.startObject(); + obj.appendKey("élève"); + obj.appendInt(1); + obj.appendKey("中文"); + obj.appendInt(2); + vb.endObject(); + Variant variant = vb.build(); + + ByteBuffer metaBuf = variant.getMetadataBuffer(); + + // hasArray branch + ImmutableMetadata writable = new ImmutableMetadata(metaBuf); + Assert.assertEquals(0, writable.getOrInsert("élève")); + Assert.assertEquals(1, writable.getOrInsert("中文")); + + // read-only branch (else path in getMetadataMap): asReadOnlyBuffer() makes isReadOnly() true + ImmutableMetadata readOnly = new ImmutableMetadata(metaBuf.asReadOnlyBuffer()); + Assert.assertEquals(0, readOnly.getOrInsert("élève")); + Assert.assertEquals(1, readOnly.getOrInsert("中文")); + } } diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java index f2697a00ff..fc1a24ba2d 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java @@ -241,6 +241,15 @@ public void testParseUnicodeString() throws IOException { Assert.assertEquals("\u00e9l\u00e8ve", v.getString()); } + @Test + public void testParseUnicodeKey() throws IOException { + Variant v = VariantJsonParser.parseJson("{\"\\u00e9l\\u00e8ve\": 42}"); + Assert.assertEquals(Variant.Type.OBJECT, v.getType()); + Variant value = v.getFieldByKey("élève"); + Assert.assertNotNull(value); + Assert.assertEquals(42, value.getInt()); + } + @Test public void testParseEscapedString() throws IOException { Variant v = VariantJsonParser.parseJson("\"hello\\nworld\"");